In [1]:
import h5py
import warnings

import pandas as pd
import numpy as np
import seaborn as sb
import statsmodels.api as sm
import matplotlib.pyplot as plt

import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as ssd

from scipy.stats import skew, kurtosis
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoLarsCV
from statsmodels import regression
from pykalman import KalmanFilter
In [2]:
# import dataset
# Use context managers so the HDF5 file handles are closed as soon as the
# data has been materialized in memory ([:] reads the full dataset into a
# numpy array); the raw h5py handles are not needed afterwards.
with h5py.File('./X.h5', 'r') as x, h5py.File('./Y.h5', 'r') as y:
    # convert to pandas DataFrame
    df_x = pd.DataFrame(x['X'][:])
    df_y = pd.DataFrame(y['Y'][:])

# rename columns for readability: integer columns 0..N-1 become 'x1'..'xN' / 'y1'..'yN'
df_x.columns = ['x'+str(i+1) for i in df_x.columns]
df_y.columns = ['y'+str(i+1) for i in df_y.columns]

Exploratory analysis on the X dataset

A first look at the data: the last one ('x56') seems to be a 0/1 variable

In [3]:
df_x.head()
Out[3]:
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 ... x47 x48 x49 x50 x51 x52 x53 x54 x55 x56
0 0.149521 0.274510 0.149521 0.274510 0.067965 0.107871 0.150392 0.081016 0.126149 0.166094 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -1.109005 0.0
1 0.141983 0.264706 0.141983 0.264706 0.066707 0.105621 0.144824 0.077886 0.120554 0.152308 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.802934 0.0
2 0.134601 0.254902 0.134601 0.254902 0.065451 0.103384 0.139469 0.074768 0.115044 0.139779 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.874133 0.0
3 0.113303 0.225490 0.113303 0.225490 0.061692 0.096754 0.124568 0.071659 0.109615 0.128341 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.135057 0.0
4 0.113303 0.225490 0.113303 0.225490 0.058175 0.091784 0.120647 0.068782 0.105719 0.125670 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.814749 0.0

5 rows × 56 columns

From raw plots of the 56 X time series, in different time windows, a clustering structure of shapes can be clearly identified, consisting of approximately 6 clusters.

In [14]:
# Plot the last 1000 observations of each of the 56 X series, one subplot per
# series (the [t1, t2] window can be moved to inspect other periods).
t2 = df_x.shape[0]
t1 = t2 - 1000
df_x.loc[t1:t2,:].plot(subplots=True, figsize=(15,5*56), legend=False, title=list(df_x.columns))
plt.show()

The similarity can be confirmed taking a look at the correlation matrix.

In [5]:
df_x_correlation = df_x.corr()
df_x_correlation
Out[5]:
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 ... x47 x48 x49 x50 x51 x52 x53 x54 x55 x56
x1 1.000000 0.987228 0.981744 0.981849 0.927609 0.943280 0.989798 0.906585 0.916215 0.950220 ... 0.043331 0.034362 -0.011307 -0.017126 -0.033941 -0.031586 -0.036862 -0.032753 0.000546 0.004944
x2 0.987228 1.000000 0.973321 0.995933 0.952242 0.965232 0.984104 0.926358 0.933387 0.942682 ... 0.053782 0.042754 -0.008030 -0.015202 -0.029408 -0.031267 -0.031974 -0.031965 0.000484 0.004390
x3 0.981744 0.973321 1.000000 0.983529 0.920454 0.934992 0.973896 0.909543 0.918504 0.947160 ... 0.046349 0.035092 -0.012132 -0.019046 -0.008794 -0.022064 -0.011547 -0.024195 0.000578 0.003975
x4 0.981849 0.995933 0.983529 1.000000 0.951000 0.963560 0.979757 0.929925 0.936700 0.943940 ... 0.055814 0.044049 -0.008211 -0.015835 -0.019523 -0.026782 -0.022112 -0.028102 0.000502 0.003957
x5 0.927609 0.952242 0.920454 0.951000 1.000000 0.998691 0.960762 0.919653 0.921625 0.907904 ... 0.138973 0.117941 0.062477 0.050581 -0.057177 -0.068946 -0.055014 -0.059152 0.000501 0.004275
x6 0.943280 0.965232 0.934992 0.963560 0.998691 1.000000 0.972220 0.927916 0.930757 0.921291 ... 0.128443 0.108686 0.053117 0.041984 -0.054425 -0.064517 -0.052982 -0.056313 0.000490 0.004380
x7 0.989798 0.984104 0.973896 0.979757 0.960762 0.972220 1.000000 0.924145 0.931976 0.956467 ... 0.080947 0.067148 0.017362 0.009730 -0.047157 -0.047649 -0.048290 -0.044971 0.000470 0.005096
x8 0.906585 0.926358 0.909543 0.929925 0.919653 0.927916 0.924145 1.000000 0.999339 0.970305 ... 0.162913 0.136482 0.094674 0.084104 -0.063073 -0.076364 -0.062887 -0.073505 0.000071 0.003890
x9 0.916215 0.933387 0.918504 0.936700 0.921625 0.930757 0.931976 0.999339 1.000000 0.977602 ... 0.154075 0.128852 0.086485 0.076594 -0.062078 -0.073533 -0.062149 -0.071134 0.000062 0.003830
x10 0.950220 0.942682 0.947160 0.943940 0.907904 0.921291 0.956467 0.970305 0.977602 1.000000 ... 0.111098 0.092074 0.052286 0.045937 -0.065520 -0.061676 -0.066430 -0.060623 0.000113 0.003600
x11 0.876081 0.908003 0.879406 0.911522 0.965373 0.962444 0.912356 0.965572 0.961342 0.915862 ... 0.199718 0.169696 0.121622 0.107187 -0.071726 -0.093091 -0.068401 -0.082403 0.000184 0.003767
x12 0.891590 0.924193 0.894183 0.927415 0.969777 0.968928 0.924802 0.973390 0.970330 0.928016 ... 0.186970 0.158499 0.108293 0.094783 -0.066482 -0.086487 -0.063897 -0.077413 0.000195 0.003782
x13 0.948144 0.972824 0.945958 0.973961 0.967605 0.975328 0.965884 0.979793 0.982691 0.966087 ... 0.124877 0.104167 0.052409 0.042981 -0.049598 -0.060328 -0.049783 -0.057151 0.000264 0.003957
x14 0.913326 0.936721 0.905833 0.935324 0.977418 0.978037 0.947566 0.920258 0.922128 0.908164 ... 0.215774 0.188573 0.130541 0.114984 -0.078984 -0.109295 -0.075337 -0.095820 0.000317 0.003653
x15 0.959706 0.974826 0.949181 0.972296 0.987682 0.992930 0.983518 0.935949 0.940206 0.939424 ... 0.143771 0.123140 0.066605 0.054974 -0.061747 -0.075146 -0.060021 -0.066413 0.000375 0.004173
x16 0.953805 0.905381 0.929109 0.898530 0.849867 0.866846 0.947825 0.825087 0.837586 0.907442 ... 0.006508 0.001591 -0.034670 -0.036924 -0.033155 -0.015527 -0.036596 -0.018198 0.000573 0.006875
x17 0.894134 0.912924 0.896513 0.916163 0.908364 0.916420 0.912956 0.991156 0.990518 0.962516 ... 0.219364 0.191180 0.155272 0.145331 -0.075682 -0.098530 -0.075022 -0.094673 0.000118 0.004730
x18 0.926039 0.937105 0.927018 0.939862 0.919351 0.929639 0.939611 0.994349 0.996887 0.986334 ... 0.162536 0.137981 0.098010 0.089322 -0.068153 -0.078519 -0.068166 -0.076004 0.000123 0.003871
x19 0.922099 0.880792 0.911478 0.879401 0.829218 0.845675 0.918415 0.873010 0.885603 0.957456 ... 0.042419 0.031028 -0.003978 -0.007274 -0.063893 -0.037567 -0.065445 -0.037751 0.000235 0.003297
x20 0.846147 0.876219 0.847784 0.878879 0.925587 0.924705 0.883388 0.945851 0.942049 0.899027 ... 0.311196 0.275661 0.231331 0.214760 -0.098061 -0.144556 -0.093518 -0.130609 0.000090 0.003961
x21 0.900663 0.931623 0.901333 0.933952 0.959968 0.962443 0.930790 0.975973 0.974687 0.939393 ... 0.225078 0.195017 0.143736 0.130058 -0.075747 -0.104071 -0.072804 -0.094070 0.000158 0.003730
x22 0.982275 0.972352 0.971747 0.970058 0.927617 0.942316 0.984548 0.934750 0.944427 0.978003 ... 0.040229 0.028882 -0.014507 -0.019035 -0.049880 -0.034621 -0.051324 -0.034706 0.000504 0.003537
x23 -0.031441 -0.036017 -0.037021 -0.038802 -0.009800 -0.013556 -0.023116 -0.021912 -0.024617 -0.030695 ... 0.501652 0.496844 0.527084 0.490738 0.003830 0.043482 0.005270 0.053172 -0.000786 -0.003390
x24 -0.044092 -0.048117 -0.046076 -0.049647 -0.033912 -0.036481 -0.041043 -0.021669 -0.023660 -0.026981 ... 0.103045 0.164668 0.575554 0.555915 -0.011734 0.020036 -0.011512 0.021425 -0.000221 -0.007190
x25 -0.031441 -0.036017 -0.037021 -0.038802 -0.009800 -0.013556 -0.023116 -0.021912 -0.024617 -0.030695 ... 0.501652 0.496844 0.527084 0.490738 0.003830 0.043482 0.005270 0.053172 -0.000786 -0.003390
x26 -0.044215 -0.048275 -0.046154 -0.049808 -0.034049 -0.036626 -0.041160 -0.022009 -0.023995 -0.027260 ... 0.101588 0.162983 0.574625 0.554575 -0.011800 0.019765 -0.011561 0.021160 -0.000201 -0.007195
x27 -0.031441 -0.036017 -0.037021 -0.038802 -0.009800 -0.013556 -0.023116 -0.021912 -0.024617 -0.030695 ... 0.501652 0.496844 0.527084 0.490738 0.003830 0.043482 0.005270 0.053172 -0.000786 -0.003390
x28 -0.044215 -0.048275 -0.046154 -0.049808 -0.034049 -0.036626 -0.041160 -0.022009 -0.023995 -0.027260 ... 0.101588 0.162983 0.574625 0.554575 -0.011800 0.019765 -0.011561 0.021160 -0.000201 -0.007195
x29 -0.038385 -0.046083 -0.036675 -0.046381 -0.074554 -0.071412 -0.052817 -0.083274 -0.080028 -0.059562 ... -0.326517 -0.223205 0.051727 0.062251 0.030054 0.292429 0.026290 0.285730 -0.001097 -0.000153
x30 -0.023577 -0.027101 -0.018189 -0.025293 -0.063863 -0.059635 -0.040128 -0.043392 -0.040389 -0.025820 ... -0.618897 -0.573711 -0.182969 -0.155947 0.016302 0.139404 0.013149 0.129764 -0.000213 -0.001077
x31 -0.037408 -0.044927 -0.036143 -0.045538 -0.072844 -0.069764 -0.051585 -0.083838 -0.080549 -0.060017 ... -0.329954 -0.228152 0.050279 0.058763 0.028342 0.288889 0.024537 0.281780 -0.001073 -0.000130
x32 -0.023408 -0.026777 -0.018607 -0.025360 -0.063232 -0.058966 -0.039493 -0.042588 -0.039472 -0.024556 ... -0.639600 -0.594577 -0.201404 -0.170954 0.014079 0.136355 0.010870 0.126690 -0.000123 -0.000810
x33 -0.037408 -0.044927 -0.036143 -0.045538 -0.072844 -0.069764 -0.051585 -0.083838 -0.080549 -0.060017 ... -0.329954 -0.228152 0.050279 0.058763 0.028342 0.288889 0.024537 0.281780 -0.001073 -0.000130
x34 -0.023333 -0.026737 -0.018573 -0.025345 -0.063245 -0.058962 -0.039433 -0.042417 -0.039304 -0.024404 ... -0.639566 -0.595287 -0.201857 -0.171478 0.013907 0.135960 0.010684 0.126271 -0.000093 -0.000716
x35 -0.020646 -0.028183 -0.016250 -0.027442 -0.092615 -0.084576 -0.048994 -0.110348 -0.103002 -0.066917 ... -0.801975 -0.696218 -0.523474 -0.480994 0.050435 0.403599 0.044005 0.388574 -0.000676 0.005302
x36 0.014544 0.013543 0.020321 0.015693 -0.046876 -0.038919 -0.009585 -0.051319 -0.044704 -0.018061 ... -0.846825 -0.793445 -0.764550 -0.710960 0.029233 0.203517 0.024561 0.188448 0.000383 0.004754
x37 -0.020646 -0.028183 -0.016250 -0.027442 -0.092615 -0.084576 -0.048994 -0.110348 -0.103002 -0.066917 ... -0.801975 -0.696218 -0.523474 -0.480994 0.050435 0.403599 0.044005 0.388574 -0.000676 0.005302
x38 0.013812 0.012857 0.019721 0.015007 -0.046642 -0.038809 -0.009903 -0.050584 -0.044097 -0.017942 ... -0.840990 -0.789032 -0.766320 -0.711088 0.028936 0.204168 0.024234 0.189361 0.000340 0.006083
x39 -0.020646 -0.028183 -0.016250 -0.027442 -0.092615 -0.084576 -0.048994 -0.110348 -0.103002 -0.066917 ... -0.801975 -0.696218 -0.523474 -0.480994 0.050435 0.403599 0.044005 0.388574 -0.000676 0.005302
x40 0.013812 0.012857 0.019721 0.015007 -0.046642 -0.038809 -0.009903 -0.050584 -0.044097 -0.017942 ... -0.840990 -0.789032 -0.766320 -0.711088 0.028936 0.204168 0.024234 0.189361 0.000340 0.006083
x41 -0.007864 -0.013402 -0.001620 -0.011402 -0.081608 -0.072946 -0.036909 -0.090334 -0.082808 -0.048460 ... -0.870101 -0.779984 -0.624292 -0.578765 0.044240 0.334652 0.037984 0.315819 -0.000022 0.003038
x42 0.022327 0.021242 0.027203 0.023373 -0.039088 -0.030653 -0.000368 -0.044107 -0.037481 -0.011603 ... -0.761531 -0.734044 -0.819447 -0.771046 0.026135 0.151353 0.021883 0.136997 0.000439 0.004699
x43 -0.007717 -0.012807 -0.002210 -0.011369 -0.078644 -0.070281 -0.035664 -0.090449 -0.083058 -0.049212 ... -0.875874 -0.780476 -0.638370 -0.588299 0.042463 0.339983 0.036333 0.323522 -0.000279 0.005605
x44 0.025237 0.025169 0.030633 0.027386 -0.033818 -0.025767 0.002644 -0.041535 -0.034922 -0.009580 ... -0.779729 -0.753808 -0.861983 -0.808763 0.027742 0.164862 0.023491 0.150997 0.000293 0.005854
x45 -0.007752 -0.012834 -0.002251 -0.011407 -0.078640 -0.070279 -0.035674 -0.090415 -0.083031 -0.049214 ... -0.875562 -0.780333 -0.638457 -0.588302 0.042389 0.339843 0.036252 0.323424 -0.000286 0.005686
x46 0.026644 0.027027 0.032293 0.029348 -0.030211 -0.022429 0.004702 -0.037849 -0.031437 -0.007121 ... -0.771648 -0.746270 -0.863139 -0.807360 0.028816 0.170446 0.024558 0.157049 0.000317 0.007307
x47 0.043331 0.053782 0.046349 0.055814 0.138973 0.128443 0.080947 0.162913 0.154075 0.111098 ... 1.000000 0.912226 0.764347 0.707422 -0.130866 -0.377009 -0.124408 -0.361890 0.000446 -0.005688
x48 0.034362 0.042754 0.035092 0.044049 0.117941 0.108686 0.067148 0.136482 0.128852 0.092074 ... 0.912226 1.000000 0.721691 0.740868 -0.106458 -0.315606 -0.099714 -0.298231 -0.000029 -0.004755
x49 -0.011307 -0.008030 -0.012132 -0.008211 0.062477 0.053117 0.017362 0.094674 0.086485 0.052286 ... 0.764347 0.721691 1.000000 0.945267 -0.087444 -0.271172 -0.082322 -0.257850 -0.000432 -0.007951
x50 -0.017126 -0.015202 -0.019046 -0.015835 0.050581 0.041984 0.009730 0.084104 0.076594 0.045937 ... 0.707422 0.740868 0.945267 1.000000 -0.082413 -0.256073 -0.077106 -0.241555 -0.000266 -0.006252
x51 -0.033941 -0.029408 -0.008794 -0.019523 -0.057177 -0.054425 -0.047157 -0.063073 -0.062078 -0.065520 ... -0.130866 -0.106458 -0.087444 -0.082413 1.000000 0.564128 0.995007 0.561204 -0.002383 0.006894
x52 -0.031586 -0.031267 -0.022064 -0.026782 -0.068946 -0.064517 -0.047649 -0.076364 -0.073533 -0.061676 ... -0.377009 -0.315606 -0.271172 -0.256073 0.564128 1.000000 0.559483 0.992193 -0.002171 0.006333
x53 -0.036862 -0.031974 -0.011547 -0.022112 -0.055014 -0.052982 -0.048290 -0.062887 -0.062149 -0.066430 ... -0.124408 -0.099714 -0.082322 -0.077106 0.995007 0.559483 1.000000 0.562950 -0.002188 0.007553
x54 -0.032753 -0.031965 -0.024195 -0.028102 -0.059152 -0.056313 -0.044971 -0.073505 -0.071134 -0.060623 ... -0.361890 -0.298231 -0.257850 -0.241555 0.561204 0.992193 0.562950 1.000000 -0.002101 0.006174
x55 0.000546 0.000484 0.000578 0.000502 0.000501 0.000490 0.000470 0.000071 0.000062 0.000113 ... 0.000446 -0.000029 -0.000432 -0.000266 -0.002383 -0.002171 -0.002188 -0.002101 1.000000 -0.000024
x56 0.004944 0.004390 0.003975 0.003957 0.004275 0.004380 0.005096 0.003890 0.003830 0.003600 ... -0.005688 -0.004755 -0.007951 -0.006252 0.006894 0.006333 0.007553 0.006174 -0.000024 1.000000

56 rows × 56 columns

A heatmap is suitable to spot clusters in the correlation structure.

In [9]:
# Heatmap of the 56x56 correlation matrix. The diverging 'bwr' colormap with
# vmin/vmax fixed at -1/+1 keeps the color scale symmetric around zero, so
# positive (red) and negative (blue) correlations are directly comparable.
f, ax = plt.subplots(figsize=(20,20))
sb.heatmap(df_x_correlation, vmin=-1.0, vmax=1.0, square=True, cmap='bwr')
plt.show()

As can be seen, there are strong clusters of correlated time series (e.g. x1,...,x22). We can employ a clustering approach to group together time series based on their correlation. Moreover, the last two factors, 'x55' and 'x56', are completely uncorrelated with the rest. We leave them out as singleton clusters and analyze the clustering structure of {'x1',...,'x54'}

In [7]:
df = df_x.loc[:,:'x54']
df.head()
Out[7]:
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 ... x45 x46 x47 x48 x49 x50 x51 x52 x53 x54
0 0.149521 0.274510 0.149521 0.274510 0.067965 0.107871 0.150392 0.081016 0.126149 0.166094 ... 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.141983 0.264706 0.141983 0.264706 0.066707 0.105621 0.144824 0.077886 0.120554 0.152308 ... -0.000074 -0.000025 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.134601 0.254902 0.134601 0.254902 0.065451 0.103384 0.139469 0.074768 0.115044 0.139779 ... -0.000220 -0.000073 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.113303 0.225490 0.113303 0.225490 0.061692 0.096754 0.124568 0.071659 0.109615 0.128341 ... -0.000585 -0.000196 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.113303 0.225490 0.113303 0.225490 0.058175 0.091784 0.120647 0.068782 0.105719 0.125670 ... -0.000945 -0.000317 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 54 columns

We can perform a hierarchical clustering of the X dataset to better understand the grouping of the time series in X.

In [4]:
df_corr = df.corr()
In [5]:
# Condensed pairwise (euclidean) distances between the ROWS of the correlation
# matrix, i.e. two series are "close" when their correlation profiles against
# all other series are similar; average-linkage clustering on top of that.
# NOTE(review): pdist is applied to the raw correlation rows, not to a
# 1 - corr dissimilarity — confirm this is the intended distance.
d = sch.distance.pdist(df_corr)
Z = sch.linkage(d, 'average')
In [8]:
# Dendrogram of the average-linkage clustering computed above (Z), with the
# original column names as leaf labels.
plt.figure(figsize=(25, 10))
labelsize=20
ticksize=15
plt.title('Hierarchical Clustering Dendrogram for X dataset', fontsize=labelsize)
plt.xlabel('X', fontsize=labelsize)
plt.ylabel('distance', fontsize=labelsize)
sch.dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
    labels = df_corr.columns
)
plt.yticks(fontsize=ticksize)
plt.xticks(rotation=-90, fontsize=ticksize)
# NOTE(review): the savefig below references an undefined name 'index' — fix before re-enabling.
#plt.savefig('img/dendogram_'+index+'.png')
plt.show()

We can perform a PCA decomposition to alternatively understand the clusters and the 'main shapes' of the different clusters. We could then use the principal components as a reduced-X dataset.

In [4]:
def make_PCA(df, num_pc): 
    """
    Fit a PCA with the first 'num_pc' principal components on dataset 'df'.

    Prints the cumulative fraction of total variance captured by the retained
    components, then returns:
    - pca: fitted sklearn.decomposition.PCA object
    - percentage: np.array of decreasing fractions of total variance explained.
    - percentage_cum: np.array of cumulative fractions of total variance explained.
    """
    # fit on the raw numpy values (PCA centers the data internally)
    pca = PCA(n_components=num_pc)
    pca.fit(np.asarray(df))

    explained = pca.explained_variance_ratio_
    explained_cum = np.cumsum(explained)

    print('{0:.2f}% of the variance is explained by the first {1:3d} PCs'.format(explained_cum[-1]*100, num_pc))

    return pca, explained, explained_cum
In [5]:
def plot_PCA_contributions(df, num_pc=None):
    """
    Plot, for dataset 'df', the individual and cumulative fractions of total
    variance explained by the first 'num_pc' principal components.
    If num_pc is None (default), the full PCA decomposition is performed.

    Returns:
    - pca: sklearn.decomposition.pca.PCA object
    - percentage: np.array of decreasing fractions of total variance explained.
    - percentage_cum: np.array of cumulative fractions of total variance explained.
    """
    if num_pc is None:
        num_pc = df.shape[1]  # keep every component

    pca, percentage, percentage_cum = make_PCA(df, num_pc)

    # 1-based component indices for the x axes
    pc_index = np.arange(1, len(percentage) + 1, 1)

    fig = plt.figure(figsize=(20, 6))

    # left panel: variance explained by each component individually
    plt.subplot(1, 2, 1)
    plt.bar(pc_index, percentage * 100, align="center")
    plt.title('Contribution of principal components to total variance', fontsize=16)
    plt.xlabel('Principal components', fontsize=16)
    plt.ylabel("Fraction of total variance (%)", fontsize=16)
    plt.xticks(pc_index, fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlim([0, num_pc + 1])

    # right panel: cumulative variance explained
    plt.subplot(1, 2, 2)
    plt.plot(pc_index, percentage_cum * 100, 'ro-')
    plt.xlabel('Principal components', fontsize=16)
    plt.ylabel("Fraction of total variance (%)", fontsize=16)
    plt.title('Cumulative contribution of principal components to total variance', fontsize=16)
    plt.xticks(pc_index, fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlim([1, num_pc])

    return pca, percentage, percentage_cum

First, let us perform a full PCA decomposition.

In [8]:
_, _, _ = plot_PCA_contributions(df)
100.00% of the variance is explained by the first  54 PCs

As it is evident, 6 components are enough to explain more than 95% of total variance. Precisely:

In [11]:
num_pc = 6
pca, _, _ = make_PCA(df, num_pc)
96.97% of the variance is explained by the first   6 PCs

We can get the 6 'statistical' factors by projecting the original dataset onto the 6-dimensional space described by the 6 PCs (eigenvectors of the covariance matrix of X).

In [12]:
# Project the dataset onto the retained principal components.
df_x_pca = pca.transform(df)

# For clarity: what is implemented by the .transform() method of the PCA class, 
# can be reproduced manually as the following projection of the (de-meaned) dataset 
# onto the pca.components_ eigenvectors:
# 
# df_x_pca = (df_x - df_x.mean()).dot(pca.components_.T).values
#
# NOTE(review): the snippet above uses df_x (56 columns) while pca was fitted
# on df (54 columns) — it should read df, not df_x.

# wrap the projected factors in a DataFrame named 'PCx1'..'PCx6'
df_PCx = pd.DataFrame(columns=['PCx'+str(i+1) for i in range(num_pc)], 
                              index=df_x.index,
                              data=df_x_pca)
df_PCx.head()
Out[12]:
PCx1 PCx2 PCx3 PCx4 PCx5 PCx6
0 0.129993 -0.093930 0.014208 -0.884041 0.065279 -0.009696
1 0.129353 -0.089726 0.012468 -0.847450 0.061536 -0.010657
2 0.128760 -0.085720 0.010852 -0.812059 0.057876 -0.011505
3 0.127920 -0.078213 0.007642 -0.744637 0.050210 -0.013154
4 0.127191 -0.075702 0.006780 -0.722951 0.048588 -0.013442

We can take a look at those PCs. We observe the 'key' shapes of the series in X.

In [15]:
df_PCx.loc[t1:t2,:].plot(subplots=True, figsize=(15,5*5), legend=False, title=list(df_PCx.columns))
plt.show()

Exploratory analysis on the Y dataset

A first look at the Y dataset clearly shows similarities between each $y_i$, $i=1,\dots,5$, and clusters of synchronously sampled X time series.

In [3]:
t1 = 0
t2 = 1000 #df_x.shape[0]
#t2 = df_y.shape[0]
#t1 = t2-1000
df_y.loc[t1:t2,:].plot(subplots=True, figsize=(15,5*6), legend=False, title=list(df_y.columns))
plt.show()

It is clear that both the X and Y series feature structural changes, and these seem to be synchronous.

In [4]:
df_x.loc[t1:t2,'x1'].plot(color='r')
df_y.loc[t1:t2,'y1'].plot(color='b')
plt.legend()
Out[4]:
<matplotlib.legend.Legend at 0x1c39137ac8>

Interestingly, the 'x56' spikes seem to be related with the structural changes in both X and Y.

In [5]:
df_x.loc[t1:t2,'x1'].plot(color='r')
df_y.loc[t1:t2,'y1'].plot(color='b')
df_x.loc[t1:t2,'x56'].plot(color='k')
plt.legend()
Out[5]:
<matplotlib.legend.Legend at 0x1c39205d68>

Let's have a look at returns time series

In [24]:
df_x.loc[t1:t2,'x1'].diff()[1:].plot(color='r')
df_y.loc[t1:t2,'y1'].diff()[1:].plot(color='b')
#df_x.loc[t1:t2,'x56'].plot(color='k')
plt.legend()
plt.show()

plt.scatter(df_x.loc[t1:t2,'x1'].diff()[1:], df_y.loc[t1:t2,'y1'].diff()[1:]) # Plot the raw data
plt.xlabel('X Value')
plt.ylabel('Y Value')
#plt.ylim([-10,10])
#plt.xlim([-10,10])
plt.show()

Analysis

Set the training and test sets to be used (default: 80%/20%)

In [125]:
def splitTrainAndTestSeries(x,y,frac):
    """
    Split the X and Y time series into a training and a test period.

    Arg:
        x: pd.DataFrame of X time series
        y: pd.DataFrame of Y time series
        frac: fraction of the total sample to keep as training set (0 < frac <= 1)
    
    return:
        x_trainHalf, x_testHalf: original X split into train and test periods, test half is re-indexed to start from 0.
        y_trainHalf, y_testHalf: original Y split into train and test periods, test half is re-indexed to start from 0.
        T_train, T_test: length of train and test halves
    """

    T = x.shape[0]
    # last index label included in the training half (.loc slicing is inclusive)
    split = int(frac*T)

    # X — take explicit copies so the index re-assignment below cannot mutate
    # views of the caller's frames (avoids SettingWithCopy behavior)
    x_trainHalf = x.loc[:split,:].copy()
    x_testHalf  = x.loc[split+1:,:].copy()
    
    # Y
    y_trainHalf = y.loc[:split,:].copy()
    y_testHalf  = y.loc[split+1:,:].copy()
    
    # train and test halves length
    T_train = x_trainHalf.shape[0]
    T_test = x_testHalf.shape[0]
    
    # re-indexing of test halves so they start from 0
    x_testHalf.index = range(T_test)
    y_testHalf.index = range(T_test)

    return x_trainHalf, x_testHalf, y_trainHalf, y_testHalf, T_train, T_test

The effective training set will be a window of length $w_{train}$ at the end of the training period. The test set will be a window of length $w_{test}$ at the beginning of the test period.

In [278]:
def setTrainAndTestSeries(x,y,frac=0.8,w_train=250,w_test=250,offset_train=0,col_x=None,col_y=None):
    """
    Arg:
        x: pd.DataFrame of X time series
        y: pd.DataFrame of Y time series
        frac: fraction of the total sample to keep as training set (0 < frac <= 1)
        w_train: length of the training set
        w_test: length of the test set
        offset_train: the training window is [T_train - w_train - offset_train : T_train - offset_train]. 
                      If offset_train = 0 (default) the training window is [T_train - w_train : T_train], 
                      ending in T_train.
        col_x: str or list of X columns to filter (default: None). If None, all columns are returned.
        col_y: str or list of Y columns to filter (default: None). If None, all columns are returned.
    
    return:
        x_train, x_test: 0 indexed train and test X time-series.
        y_train, y_test: 0 indexed train and test Y time-series.
    """
    
    # split original series in train and test periods
    x_trainHalf, x_testHalf, y_trainHalf, y_testHalf, T_train, _ = splitTrainAndTestSeries(x,y,frac)

    # inclusive .loc row windows: training window ends offset_train rows before
    # the end of the training half; test window is the first w_test rows
    train_rows = slice(T_train - w_train - offset_train, T_train - offset_train - 1)
    test_rows = slice(None, w_test - 1)

    # None -> keep every column (slice(None) is the .loc equivalent of ':');
    # a str selects a Series, a list a DataFrame, exactly as before
    cols_x = slice(None) if col_x is None else col_x
    cols_y = slice(None) if col_y is None else col_y

    # explicit copies so the index re-assignment below does not mutate views
    # of the train/test halves (avoids SettingWithCopy behavior)
    x_train = x_trainHalf.loc[train_rows, cols_x].copy()
    x_test = x_testHalf.loc[test_rows, cols_x].copy()
    y_train = y_trainHalf.loc[train_rows, cols_y].copy()
    y_test = y_testHalf.loc[test_rows, cols_y].copy()

    # re-index of train set to start from zero
    x_train.index = range(w_train)
    y_train.index = range(w_train)
    
    return x_train, x_test, y_train, y_test
    

 Model 1: Linear regression

The first model I tried is a linear regression model: $$ y_t = \alpha + \beta \cdot PCx_t + \epsilon $$ performed for each $y_{i,t}$, $i=1,...,5$ against each of the 6 PCs 'PCx1',...,'PCx6'. If we denote with $T_{train}$ the end of the training set period, the regression is run over $[T_{train}-w_{train}:T_{train}]$ and the prediction is made over $[T_{train}:T_{train}+w_{test}]$ (default $w_{train}=w_{test}=250$).

In [195]:
def runLinearModel(x_train, x_test, y_train, y_test, IO=False):
    """
    Fit y = alpha + beta * x by OLS on the training window and predict both
    in-sample and out-of-sample.

    Arg:
        x_train, x_test: pd.Series of the single predictor (train / test window)
        y_train, y_test: pd.Series of the target (train / test window)
        IO: if True, print both RMSEs and the full OLS summary

    return:
        linearModelFit: fitted statsmodels OLS results object
        y_train_pred, y_test_pred: predicted series, renamed '<name>_pred [trained/test]'
        RMSE_train, RMSE_test: in- and out-of-sample root mean squared errors
    """
    
    # Running the linear regression (add_constant supplies the intercept column)
    linearModelFit = regression.linear_model.OLS(y_train, sm.add_constant(x_train)).fit()
    
    # .iloc avoids deprecated positional indexing on the labeled params Series
    alpha = linearModelFit.params.iloc[0]
    beta = linearModelFit.params.iloc[1]
    
    y_train_pred = alpha + beta*x_train    
    y_train_pred.name = y_train.name + '_pred [trained]'

    y_test_pred = alpha + beta*x_test
    y_test_pred.name = y_test.name + '_pred [test]'
    
    RMSE_train = np.sqrt(np.mean((y_train_pred.values - y_train.values) ** 2))
    RMSE_test = np.sqrt(np.mean((y_test_pred.values - y_test.values) ** 2))
 
    if IO:
        print('RMSE in-sample = {0:.2f}'.format(RMSE_train))
        print('RMSE out-of-sample = {0:.2f}'.format(RMSE_test))
        print(linearModelFit.summary())

    return linearModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
In [385]:
def plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred, titleString='', plot_x = True, plotTrain=True, plotTest=True):
    """
    Plot actual vs. predicted y series, in-sample and/or out-of-sample.

    Each requested panel shows the (optional) predictor in red, the actual y
    in blue and the predicted y in black.
    """

    def _draw(x_series, y_actual, y_pred, title):
        # one figure: optional predictor (red), actual (blue), prediction (black)
        if plot_x:
            x_series.plot(color='r')
        y_actual.plot(color='b')
        y_pred.plot(color='k')
        plt.title(title)
        plt.legend()
        plt.show()

    if plotTrain:
        _draw(x_train, y_train, y_train_pred, 'In sample' + titleString)

    if plotTest:
        _draw(x_test, y_test, y_test_pred, 'Out of sample' + titleString)

    return None

Results of linear regression model are quite poor, even if $(\alpha, \beta)$ are usually significant.

In [547]:
# Run one univariate OLS for every (y_i, PCx_j) pair and collect in/out-of-sample
# RMSEs into nested dicts: results[model]['Train'|'Test'][y_i]['LR_PCxj'] = RMSE.
results = {}
results['Linear regression'] = {}
results['Linear regression']['Train'] = {}
results['Linear regression']['Test'] = {}

# loop over the y to be predicted
for y_i in df_y.columns:
    
    results['Linear regression']['Train'][y_i] = {} 
    results['Linear regression']['Test'][y_i] = {} 
    
    # loop over the single PCs used as predictors in a linear regression model
    for PCx_j in df_PCx.columns:
        
        # retrieve test series for training and prediction
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx,y=df_y,
                                                                 col_x=PCx_j,col_y=y_i)
        
        # run the linear regression and retrieve predicted y (both in- and out- of sample)
        linearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLinearModel(x_train, x_test, 
                                                                                             y_train, y_test, 
                                                                                             IO=True)
        
        # plot model predictions (in- and out- of sample, for comparison)
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
        
        results['Linear regression']['Train'][y_i]['LR_'+PCx_j] = RMSE_train
        results['Linear regression']['Test'][y_i]['LR_'+PCx_j] = RMSE_test

# tabulate RMSEs: rows = predictors ('LR_PCx1'..), columns = targets ('y1'..)
trainResultLinearModel = pd.DataFrame(results['Linear regression']['Train'])
testResultLinearModel = pd.DataFrame(results['Linear regression']['Test'])
trainResultLinearModel.name='LR'
testResultLinearModel.name='LR'
RMSE in-sample = 0.15
RMSE out-of-sample = 0.44
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y1   R-squared:                       0.916
Model:                            OLS   Adj. R-squared:                  0.916
Method:                 Least Squares   F-statistic:                     2718.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.20e-135
Time:                        21:20:33   Log-Likelihood:                 116.15
No. Observations:                 250   AIC:                            -228.3
Df Residuals:                     248   BIC:                            -221.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4303      0.014     30.141      0.000       0.402       0.458
PCx1           1.2884      0.025     52.139      0.000       1.240       1.337
==============================================================================
Omnibus:                        0.821   Durbin-Watson:                   0.179
Prob(Omnibus):                  0.663   Jarque-Bera (JB):                0.885
Skew:                          -0.030   Prob(JB):                        0.642
Kurtosis:                       2.715   Cond. No.                         3.09
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.13
RMSE out-of-sample = 0.43
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y1   R-squared:                       0.935
Model:                            OLS   Adj. R-squared:                  0.935
Method:                 Least Squares   F-statistic:                     3595.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.34e-149
Time:                        21:20:34   Log-Likelihood:                 148.53
No. Observations:                 250   AIC:                            -293.1
Df Residuals:                     248   BIC:                            -286.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6558      0.015     42.465      0.000       0.625       0.686
PCx2          -0.9052      0.015    -59.962      0.000      -0.935      -0.875
==============================================================================
Omnibus:                       90.386   Durbin-Watson:                   0.224
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              583.105
Skew:                          -1.269   Prob(JB):                    2.40e-127
Kurtosis:                      10.038   Cond. No.                         3.34
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.30
RMSE out-of-sample = 0.66
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y1   R-squared:                       0.676
Model:                            OLS   Adj. R-squared:                  0.675
Method:                 Least Squares   F-statistic:                     518.2
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           1.11e-62
Time:                        21:20:35   Log-Likelihood:                -53.066
No. Observations:                 250   AIC:                             110.1
Df Residuals:                     248   BIC:                             117.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.3302      0.057    -23.527      0.000      -1.442      -1.219
PCx3          -1.3353      0.059    -22.763      0.000      -1.451      -1.220
==============================================================================
Omnibus:                       46.319   Durbin-Watson:                   0.102
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               72.680
Skew:                          -1.059   Prob(JB):                     1.65e-16
Kurtosis:                       4.578   Cond. No.                         5.78
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.14
RMSE out-of-sample = 0.29
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y1   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                     3459.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.19e-147
Time:                        21:20:36   Log-Likelihood:                 144.00
No. Observations:                 250   AIC:                            -284.0
Df Residuals:                     248   BIC:                            -277.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2156      0.009    -24.517      0.000      -0.233      -0.198
PCx4          -0.5544      0.009    -58.812      0.000      -0.573      -0.536
==============================================================================
Omnibus:                        7.084   Durbin-Watson:                   0.171
Prob(Omnibus):                  0.029   Jarque-Bera (JB):                7.222
Skew:                           0.322   Prob(JB):                       0.0270
Kurtosis:                       3.528   Cond. No.                         1.23
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.50
RMSE out-of-sample = 0.53
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y1   R-squared:                       0.110
Model:                            OLS   Adj. R-squared:                  0.106
Method:                 Least Squares   F-statistic:                     30.66
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           7.83e-08
Time:                        21:20:36   Log-Likelihood:                -179.49
No. Observations:                 250   AIC:                             363.0
Df Residuals:                     248   BIC:                             370.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0912      0.049      1.854      0.065      -0.006       0.188
PCx5          -1.0244      0.185     -5.537      0.000      -1.389      -0.660
==============================================================================
Omnibus:                      437.833   Durbin-Watson:                   0.028
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               20.923
Skew:                          -0.204   Prob(JB):                     2.86e-05
Kurtosis:                       1.643   Cond. No.                         6.13
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.48
RMSE out-of-sample = 0.92
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y1   R-squared:                       0.175
Model:                            OLS   Adj. R-squared:                  0.171
Method:                 Least Squares   F-statistic:                     52.47
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           5.49e-12
Time:                        21:20:37   Log-Likelihood:                -170.07
No. Observations:                 250   AIC:                             344.1
Df Residuals:                     248   BIC:                             351.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.0063      0.126     -7.966      0.000      -1.255      -0.758
PCx6          -2.2992      0.317     -7.244      0.000      -2.924      -1.674
==============================================================================
Omnibus:                      221.466   Durbin-Watson:                   0.029
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.402
Skew:                          -0.234   Prob(JB):                     6.12e-05
Kurtosis:                       1.718   Cond. No.                         12.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.19
RMSE out-of-sample = 0.37
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y2   R-squared:                       0.901
Model:                            OLS   Adj. R-squared:                  0.901
Method:                 Least Squares   F-statistic:                     2266.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          9.75e-127
Time:                        21:20:38   Log-Likelihood:                 60.455
No. Observations:                 250   AIC:                            -116.9
Df Residuals:                     248   BIC:                            -109.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6536      0.018     36.639      0.000       0.618       0.689
PCx1           1.4699      0.031     47.605      0.000       1.409       1.531
==============================================================================
Omnibus:                       12.550   Durbin-Watson:                   0.119
Prob(Omnibus):                  0.002   Jarque-Bera (JB):               13.373
Skew:                           0.566   Prob(JB):                      0.00125
Kurtosis:                       3.043   Cond. No.                         3.09
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.22
RMSE out-of-sample = 0.34
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y2   R-squared:                       0.872
Model:                            OLS   Adj. R-squared:                  0.872
Method:                 Least Squares   F-statistic:                     1693.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          8.68e-113
Time:                        21:20:38   Log-Likelihood:                 28.092
No. Observations:                 250   AIC:                            -52.18
Df Residuals:                     248   BIC:                            -45.14
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8876      0.025     35.502      0.000       0.838       0.937
PCx2          -1.0055      0.024    -41.143      0.000      -1.054      -0.957
==============================================================================
Omnibus:                        9.079   Durbin-Watson:                   0.089
Prob(Omnibus):                  0.011   Jarque-Bera (JB):                9.153
Skew:                          -0.464   Prob(JB):                       0.0103
Kurtosis:                       3.136   Cond. No.                         3.34
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.41
RMSE out-of-sample = 0.54
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y2   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.542
Method:                 Least Squares   F-statistic:                     295.1
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           4.18e-44
Time:                        21:20:39   Log-Likelihood:                -131.10
No. Observations:                 250   AIC:                             266.2
Df Residuals:                     248   BIC:                             273.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2219      0.077    -15.817      0.000      -1.374      -1.070
PCx3          -1.3769      0.080    -17.179      0.000      -1.535      -1.219
==============================================================================
Omnibus:                       16.065   Durbin-Watson:                   0.055
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               18.017
Skew:                          -0.653   Prob(JB):                     0.000122
Kurtosis:                       2.849   Cond. No.                         5.78
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.19
RMSE out-of-sample = 0.29
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y2   R-squared:                       0.901
Model:                            OLS   Adj. R-squared:                  0.901
Method:                 Least Squares   F-statistic:                     2268.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          9.13e-127
Time:                        21:20:39   Log-Likelihood:                 60.521
No. Observations:                 250   AIC:                            -117.0
Df Residuals:                     248   BIC:                            -110.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0823      0.012     -6.705      0.000      -0.107      -0.058
PCx4          -0.6269      0.013    -47.619      0.000      -0.653      -0.601
==============================================================================
Omnibus:                        3.395   Durbin-Watson:                   0.141
Prob(Omnibus):                  0.183   Jarque-Bera (JB):                3.445
Skew:                          -0.280   Prob(JB):                        0.179
Kurtosis:                       2.867   Cond. No.                         1.23
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.54
RMSE out-of-sample = 0.68
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y2   R-squared:                       0.217
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                     68.75
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           7.12e-15
Time:                        21:20:40   Log-Likelihood:                -198.49
No. Observations:                 250   AIC:                             401.0
Df Residuals:                     248   BIC:                             408.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3661      0.053      6.897      0.000       0.262       0.471
PCx5          -1.6553      0.200     -8.292      0.000      -2.049      -1.262
==============================================================================
Omnibus:                      136.483   Durbin-Watson:                   0.032
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               15.637
Skew:                          -0.088   Prob(JB):                     0.000402
Kurtosis:                       1.788   Cond. No.                         6.13
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.58
RMSE out-of-sample = 0.72
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y2   R-squared:                       0.077
Model:                            OLS   Adj. R-squared:                  0.073
Method:                 Least Squares   F-statistic:                     20.72
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           8.35e-06
Time:                        21:20:41   Log-Likelihood:                -219.05
No. Observations:                 250   AIC:                             442.1
Df Residuals:                     248   BIC:                             449.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.6509      0.154     -4.236      0.000      -0.954      -0.348
PCx6          -1.7574      0.386     -4.552      0.000      -2.518      -0.997
==============================================================================
Omnibus:                      184.841   Durbin-Watson:                   0.017
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               17.874
Skew:                          -0.183   Prob(JB):                     0.000131
Kurtosis:                       1.742   Cond. No.                         12.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.18
RMSE out-of-sample = 0.30
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y3   R-squared:                       0.877
Model:                            OLS   Adj. R-squared:                  0.877
Method:                 Least Squares   F-statistic:                     1772.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          6.19e-115
Time:                        21:20:41   Log-Likelihood:                 67.342
No. Observations:                 250   AIC:                            -130.7
Df Residuals:                     248   BIC:                            -123.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7268      0.017     41.877      0.000       0.693       0.761
PCx1           1.2643      0.030     42.090      0.000       1.205       1.324
==============================================================================
Omnibus:                        4.831   Durbin-Watson:                   0.125
Prob(Omnibus):                  0.089   Jarque-Bera (JB):                4.802
Skew:                          -0.304   Prob(JB):                       0.0906
Kurtosis:                       2.698   Cond. No.                         3.09
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.21
RMSE out-of-sample = 0.63
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y3   R-squared:                       0.838
Model:                            OLS   Adj. R-squared:                  0.837
Method:                 Least Squares   F-statistic:                     1284.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          4.81e-100
Time:                        21:20:42   Log-Likelihood:                 32.802
No. Observations:                 250   AIC:                            -61.60
Df Residuals:                     248   BIC:                            -54.56
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9233      0.025     37.634      0.000       0.875       0.972
PCx2          -0.8594      0.024    -35.833      0.000      -0.907      -0.812
==============================================================================
Omnibus:                        8.906   Durbin-Watson:                   0.091
Prob(Omnibus):                  0.012   Jarque-Bera (JB):                9.189
Skew:                          -0.383   Prob(JB):                       0.0101
Kurtosis:                       3.543   Cond. No.                         3.34
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.38
RMSE out-of-sample = 0.61
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y3   R-squared:                       0.488
Model:                            OLS   Adj. R-squared:                  0.486
Method:                 Least Squares   F-statistic:                     236.7
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           5.98e-38
Time:                        21:20:43   Log-Likelihood:                -111.06
No. Observations:                 250   AIC:                             226.1
Df Residuals:                     248   BIC:                             233.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.8444      0.071    -11.843      0.000      -0.985      -0.704
PCx3          -1.1381      0.074    -15.384      0.000      -1.284      -0.992
==============================================================================
Omnibus:                       15.455   Durbin-Watson:                   0.058
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               15.306
Skew:                          -0.558   Prob(JB):                     0.000475
Kurtosis:                       2.526   Cond. No.                         5.78
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.18
RMSE out-of-sample = 0.33
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y3   R-squared:                       0.887
Model:                            OLS   Adj. R-squared:                  0.887
Method:                 Least Squares   F-statistic:                     1954.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.35e-119
Time:                        21:20:43   Log-Likelihood:                 78.154
No. Observations:                 250   AIC:                            -152.3
Df Residuals:                     248   BIC:                            -145.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0932      0.011      8.142      0.000       0.071       0.116
PCx4          -0.5423      0.012    -44.205      0.000      -0.566      -0.518
==============================================================================
Omnibus:                        9.175   Durbin-Watson:                   0.102
Prob(Omnibus):                  0.010   Jarque-Bera (JB):                9.454
Skew:                          -0.476   Prob(JB):                      0.00885
Kurtosis:                       3.020   Cond. No.                         1.23
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.45
RMSE out-of-sample = 0.98
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y3   R-squared:                       0.258
Model:                            OLS   Adj. R-squared:                  0.255
Method:                 Least Squares   F-statistic:                     86.22
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           8.42e-18
Time:                        21:20:44   Log-Likelihood:                -157.51
No. Observations:                 250   AIC:                             319.0
Df Residuals:                     248   BIC:                             326.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5100      0.045     11.320      0.000       0.421       0.599
PCx5          -1.5735      0.169     -9.286      0.000      -1.907      -1.240
==============================================================================
Omnibus:                      165.665   Durbin-Watson:                   0.044
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               16.456
Skew:                          -0.094   Prob(JB):                     0.000267
Kurtosis:                       1.757   Cond. No.                         6.13
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.52
RMSE out-of-sample = 0.48
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y3   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     11.63
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           0.000757
Time:                        21:20:44   Log-Likelihood:                -189.08
No. Observations:                 250   AIC:                             382.2
Df Residuals:                     248   BIC:                             389.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2626      0.136     -1.927      0.055      -0.531       0.006
PCx6          -1.1681      0.342     -3.411      0.001      -1.843      -0.494
==============================================================================
Omnibus:                     1941.970   Durbin-Watson:                   0.020
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               23.856
Skew:                          -0.240   Prob(JB):                     6.60e-06
Kurtosis:                       1.565   Cond. No.                         12.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.11
RMSE out-of-sample = 0.40
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.938
Model:                            OLS   Adj. R-squared:                  0.938
Method:                 Least Squares   F-statistic:                     3744.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.21e-151
Time:                        21:20:45   Log-Likelihood:                 186.38
No. Observations:                 250   AIC:                            -368.8
Df Residuals:                     248   BIC:                            -361.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2420      0.011     22.446      0.000       0.221       0.263
PCx1           1.1417      0.019     61.187      0.000       1.105       1.178
==============================================================================
Omnibus:                        8.372   Durbin-Watson:                   0.295
Prob(Omnibus):                  0.015   Jarque-Bera (JB):               14.770
Skew:                           0.082   Prob(JB):                     0.000620
Kurtosis:                       4.179   Cond. No.                         3.09
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.13
RMSE out-of-sample = 0.73
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.922
Model:                            OLS   Adj. R-squared:                  0.922
Method:                 Least Squares   F-statistic:                     2940.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.55e-139
Time:                        21:20:46   Log-Likelihood:                 158.29
No. Observations:                 250   AIC:                            -312.6
Df Residuals:                     248   BIC:                            -305.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4291      0.015     28.890      0.000       0.400       0.458
PCx2          -0.7873      0.015    -54.226      0.000      -0.816      -0.759
==============================================================================
Omnibus:                      134.091   Durbin-Watson:                   0.221
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1006.309
Skew:                          -2.014   Prob(JB):                    3.04e-219
Kurtosis:                      11.965   Cond. No.                         3.34
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.27
RMSE out-of-sample = 0.78
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.648
Model:                            OLS   Adj. R-squared:                  0.646
Method:                 Least Squares   F-statistic:                     456.4
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           3.81e-58
Time:                        21:20:46   Log-Likelihood:                -30.460
No. Observations:                 250   AIC:                             64.92
Df Residuals:                     248   BIC:                             71.96
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2832      0.052    -24.843      0.000      -1.385      -1.181
PCx3          -1.1449      0.054    -21.363      0.000      -1.250      -1.039
==============================================================================
Omnibus:                       75.169   Durbin-Watson:                   0.106
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              151.556
Skew:                          -1.520   Prob(JB):                     1.23e-33
Kurtosis:                       5.303   Cond. No.                         5.78
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.12
RMSE out-of-sample = 0.43
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.930
Model:                            OLS   Adj. R-squared:                  0.929
Method:                 Least Squares   F-statistic:                     3280.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          5.45e-145
Time:                        21:20:47   Log-Likelihood:                 170.94
No. Observations:                 250   AIC:                            -337.9
Df Residuals:                     248   BIC:                            -330.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.3293      0.008    -41.698      0.000      -0.345      -0.314
PCx4          -0.4847      0.008    -57.272      0.000      -0.501      -0.468
==============================================================================
Omnibus:                        4.083   Durbin-Watson:                   0.124
Prob(Omnibus):                  0.130   Jarque-Bera (JB):                4.140
Skew:                          -0.309   Prob(JB):                        0.126
Kurtosis:                       2.876   Cond. No.                         1.23
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.43
RMSE out-of-sample = 0.85
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.114
Method:                 Least Squares   F-statistic:                     32.92
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           2.79e-08
Time:                        21:20:47   Log-Likelihood:                -145.36
No. Observations:                 250   AIC:                             294.7
Df Residuals:                     248   BIC:                             301.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0548      0.043     -1.276      0.203      -0.139       0.030
PCx5          -0.9262      0.161     -5.738      0.000      -1.244      -0.608
==============================================================================
Omnibus:                      314.813   Durbin-Watson:                   0.035
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               19.610
Skew:                          -0.178   Prob(JB):                     5.52e-05
Kurtosis:                       1.675   Cond. No.                         6.13
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.42
RMSE out-of-sample = 0.69
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.172
Model:                            OLS   Adj. R-squared:                  0.168
Method:                 Least Squares   F-statistic:                     51.38
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           8.70e-12
Time:                        21:20:48   Log-Likelihood:                -137.41
No. Observations:                 250   AIC:                             278.8
Df Residuals:                     248   BIC:                             285.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.0153      0.111     -9.159      0.000      -1.234      -0.797
PCx6          -1.9966      0.279     -7.168      0.000      -2.545      -1.448
==============================================================================
Omnibus:                      165.249   Durbin-Watson:                   0.034
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               21.401
Skew:                          -0.362   Prob(JB):                     2.25e-05
Kurtosis:                       1.762   Cond. No.                         12.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.14
RMSE out-of-sample = 0.26
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y5   R-squared:                       0.919
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     2815.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          2.22e-137
Time:                        21:20:49   Log-Likelihood:                 134.68
No. Observations:                 250   AIC:                            -265.4
Df Residuals:                     248   BIC:                            -258.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1059      0.013      7.990      0.000       0.080       0.132
PCx1           1.2175      0.023     53.060      0.000       1.172       1.263
==============================================================================
Omnibus:                        0.033   Durbin-Watson:                   0.198
Prob(Omnibus):                  0.984   Jarque-Bera (JB):                0.098
Skew:                           0.025   Prob(JB):                        0.952
Kurtosis:                       2.917   Cond. No.                         3.09
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.10
RMSE out-of-sample = 0.43
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y5   R-squared:                       0.955
Model:                            OLS   Adj. R-squared:                  0.955
Method:                 Least Squares   F-statistic:                     5322.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.37e-169
Time:                        21:20:49   Log-Likelihood:                 209.42
No. Observations:                 250   AIC:                            -414.8
Df Residuals:                     248   BIC:                            -407.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.3257      0.012     26.907      0.000       0.302       0.350
PCx2          -0.8632      0.012    -72.953      0.000      -0.887      -0.840
==============================================================================
Omnibus:                      178.800   Durbin-Watson:                   0.339
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             3137.799
Skew:                          -2.564   Prob(JB):                         0.00
Kurtosis:                      19.581   Cond. No.                         3.34
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.27
RMSE out-of-sample = 0.52
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y5   R-squared:                       0.708
Model:                            OLS   Adj. R-squared:                  0.707
Method:                 Least Squares   F-statistic:                     601.4
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           3.03e-68
Time:                        21:20:50   Log-Likelihood:                -25.667
No. Observations:                 250   AIC:                             55.33
Df Residuals:                     248   BIC:                             62.38
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.5826      0.051    -31.233      0.000      -1.682      -1.483
PCx3          -1.2893      0.053    -24.523      0.000      -1.393      -1.186
==============================================================================
Omnibus:                       67.785   Durbin-Watson:                   0.120
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              134.767
Skew:                          -1.365   Prob(JB):                     5.44e-30
Kurtosis:                       5.342   Cond. No.                         5.78
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.11
RMSE out-of-sample = 0.16
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y5   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.947
Method:                 Least Squares   F-statistic:                     4448.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          2.15e-160
Time:                        21:20:50   Log-Likelihood:                 188.08
No. Observations:                 250   AIC:                            -372.2
Df Residuals:                     248   BIC:                            -365.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.5051      0.007    -68.495      0.000      -0.520      -0.491
PCx4          -0.5271      0.008    -66.694      0.000      -0.543      -0.512
==============================================================================
Omnibus:                        1.630   Durbin-Watson:                   0.195
Prob(Omnibus):                  0.443   Jarque-Bera (JB):                1.314
Skew:                          -0.148   Prob(JB):                        0.518
Kurtosis:                       3.198   Cond. No.                         1.23
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.47
RMSE out-of-sample = 0.57
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y5   R-squared:                       0.099
Model:                            OLS   Adj. R-squared:                  0.095
Method:                 Least Squares   F-statistic:                     27.28
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           3.72e-07
Time:                        21:20:51   Log-Likelihood:                -166.51
No. Observations:                 250   AIC:                             337.0
Df Residuals:                     248   BIC:                             344.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2249      0.047     -4.814      0.000      -0.317      -0.133
PCx5          -0.9175      0.176     -5.223      0.000      -1.263      -0.572
==============================================================================
Omnibus:                      353.934   Durbin-Watson:                   0.029
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               21.225
Skew:                          -0.250   Prob(JB):                     2.46e-05
Kurtosis:                       1.663   Cond. No.                         6.13
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.45
RMSE out-of-sample = 0.65
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y5   R-squared:                       0.191
Model:                            OLS   Adj. R-squared:                  0.188
Method:                 Least Squares   F-statistic:                     58.57
Date:                Mon, 27 Aug 2018   Prob (F-statistic):           4.35e-13
Time:                        21:20:52   Log-Likelihood:                -153.05
No. Observations:                 250   AIC:                             310.1
Df Residuals:                     248   BIC:                             317.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2890      0.118    -10.923      0.000      -1.521      -1.057
PCx6          -2.2694      0.297     -7.653      0.000      -2.853      -1.685
==============================================================================
Omnibus:                      451.564   Durbin-Watson:                   0.032
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               21.743
Skew:                          -0.244   Prob(JB):                     1.90e-05
Kurtosis:                       1.640   Cond. No.                         12.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [548]:
# Alias the single-predictor linear-regression RMSE tables under generic
# names, so the plotting cells below can be reused for other models'
# result tables without edits.
trainResults = trainResultLinearModel
testResults = testResultLinearModel

To measure the quality of in- and out- of sample prediction, we can look at the in- and out- of sample RMSE (Root Mean Squared Error)

$$ RMSE = \sqrt{\frac{1}{w} \sum_{t=0}^{w-1} (y_t - \hat{y}_t)^2} $$

where $w=w_{train}$ (in-sample) $w=w_{test}$ (out-of-sample) and $$ \hat{y}_t = \alpha + \beta x_t $$ We see that:

  • in sample: PCx1,PCx2 and PCx4 are equally promising for all the y;
  • out of sample: only PCx1 and PCx4 are somehow consistent out of sample.
In [549]:
# Plot the in-sample RMSE for each (model, target) pair, then display
# the underlying table.
ax = trainResults.plot()
# One tick per row of the table. The original used range(len(...)+1),
# which creates an extra tick with no matching label (a ticks/labels
# length mismatch that newer matplotlib versions reject).
ax.set_xticks(range(len(trainResults.index.values)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.tight_layout()
plt.show()

trainResultLinearModel
Out[549]:
y1 y2 y3 y4 y5
LR_PCx1 0.152051 0.189995 0.184832 0.114813 0.141190
LR_PCx2 0.133582 0.216253 0.212217 0.128466 0.104705
LR_PCx3 0.299190 0.408789 0.377306 0.273323 0.268133
LR_PCx4 0.136021 0.189945 0.177009 0.122126 0.114034
LR_PCx5 0.496102 0.535281 0.454352 0.432800 0.470991
LR_PCx6 0.477755 0.581159 0.515503 0.419247 0.446309
In [550]:
# Plot the out-of-sample RMSE for each (model, target) pair, then display
# the underlying table.
ax = testResults.plot()
# One tick per row of the table. The original used range(len(...)+1),
# which creates an extra tick with no matching label (a ticks/labels
# length mismatch that newer matplotlib versions reject).
ax.set_xticks(range(len(testResults.index.values)))
ax.set_xticklabels(testResults.index.values)
plt.title('RMSE Out of sample')
plt.tight_layout()
plt.show()

testResultLinearModel
Out[550]:
y1 y2 y3 y4 y5
LR_PCx1 0.437942 0.367888 0.302783 0.398135 0.257347
LR_PCx2 0.425370 0.338904 0.625755 0.732607 0.426990
LR_PCx3 0.661684 0.540318 0.608685 0.778618 0.519273
LR_PCx4 0.287331 0.286109 0.328546 0.430902 0.161760
LR_PCx5 0.532020 0.675560 0.977715 0.848494 0.566461
LR_PCx6 0.915018 0.716783 0.475248 0.685394 0.653452

Considering 'y4' as an example, I would like to understand why 'PCx4' out-performs 'PCx2' out-of-sample $$ RMSE_{PCx4,test}=0.43 < RMSE_{PCx2,test}= 0.73 $$ given their similar performance in-sample $$ RMSE_{PCx4,train}=0.12 \simeq RMSE_{PCx2,train}= 0.13 $$

We can re-run the analysis to compare the two cases. Looking at the in- and out- of sample correlations, we see that PCx4 keeps a high degree of (anti-) correlation with y4 also in the test part (-0.96 and -0.85, respectively), while PCx2 becomes less correlated (from -0.96 to -0.37).

In [551]:
# Re-run the y4 regressions for the two competing predictors and compare
# how the predictor/target correlation changes between train and test.
for target in ['y4']:

    for predictor in ['PCx2', 'PCx4']:

        # split the chosen predictor/target pair into train and test windows
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(
            x=df_PCx, y=df_y, col_x=predictor, col_y=target)

        # correlation between predictor and target, in each window
        corr_train = pd.concat([x_train, y_train], axis=1).corr().iloc[0, 1]
        corr_test = pd.concat([x_test, y_test], axis=1).corr().iloc[0, 1]
        print('[train] corr(' + predictor + ',' + target + ') = {}'.format(corr_train))
        print('[test] corr(' + predictor + ',' + target + ') = {}'.format(corr_test))

        # fit the single-predictor OLS model and collect its predictions
        linearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLinearModel(
            x_train, x_test, y_train, y_test, IO=True)

        # visual comparison of the in- and out-of-sample fits
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)
[train] corr(PCx2,y4) = -0.9603230526840629
[test] corr(PCx2,y4) = -0.3735046662282994
RMSE in-sample = 0.13
RMSE out-of-sample = 0.73
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.922
Model:                            OLS   Adj. R-squared:                  0.922
Method:                 Least Squares   F-statistic:                     2940.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.55e-139
Time:                        21:23:42   Log-Likelihood:                 158.29
No. Observations:                 250   AIC:                            -312.6
Df Residuals:                     248   BIC:                            -305.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4291      0.015     28.890      0.000       0.400       0.458
PCx2          -0.7873      0.015    -54.226      0.000      -0.816      -0.759
==============================================================================
Omnibus:                      134.091   Durbin-Watson:                   0.221
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1006.309
Skew:                          -2.014   Prob(JB):                    3.04e-219
Kurtosis:                      11.965   Cond. No.                         3.34
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[train] corr(PCx4,y4) = -0.9642135015109609
[test] corr(PCx4,y4) = -0.8541526207475868
RMSE in-sample = 0.12
RMSE out-of-sample = 0.43
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.930
Model:                            OLS   Adj. R-squared:                  0.929
Method:                 Least Squares   F-statistic:                     3280.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          5.45e-145
Time:                        21:23:42   Log-Likelihood:                 170.94
No. Observations:                 250   AIC:                            -337.9
Df Residuals:                     248   BIC:                            -330.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.3293      0.008    -41.698      0.000      -0.345      -0.314
PCx4          -0.4847      0.008    -57.272      0.000      -0.501      -0.468
==============================================================================
Omnibus:                        4.083   Durbin-Watson:                   0.124
Prob(Omnibus):                  0.130   Jarque-Bera (JB):                4.140
Skew:                          -0.309   Prob(JB):                        0.126
Kurtosis:                       2.876   Cond. No.                         1.23
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

So we see that a change in the correlation regime between the train and test periods likely drives the deterioration of the prediction.

Intuitively, the presence of jumps in the time series makes a model with a single $(\alpha,\beta)$ highly unlikely to be predictive.

Indeed, we can analyze the effect of jumps on rolling estimates of intercept and slope. The following runs the linear model over rolling windows: $$ [T_{train}-w_{train}-500:T_{train}-500], [T_{train}-w_{train}-499:T_{train}-499], \dots, [T_{train}-w_{train}:T_{train}] $$

In [286]:
# Estimate (alpha, beta) of the y1 ~ PCx1 regression over `rolling_w`
# rolling training windows, shifting the window one step at a time
# (oldest offset first, so the lists are in chronological order).
rolling_w = 500
alpha = []
beta = []

for roll_w in np.arange(rolling_w)[::-1]:

    # retrieve train/test series for this offset of the training window
    x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx,y=df_y,
                                                             offset_train=roll_w,
                                                             col_x='PCx1',col_y='y1')

    # fit the linear regression for this window; predictions and RMSE
    # are not needed here, only the fitted coefficients
    linearModelFitted, _, _, _, _ = runLinearModel(x_train, x_test, 
                                                   y_train, y_test, 
                                                   IO=False)
    
    # rolling intercept and slope values; use .iloc for positional access —
    # integer-key lookup on a label-indexed pandas Series (params[0]) is
    # deprecated and removed in recent pandas versions
    alpha.append(linearModelFitted.params.iloc[0])
    beta.append(linearModelFitted.params.iloc[1])
In [289]:
# Align the rolling (alpha, beta) estimates with the last `rolling_w`
# observations of PCx1 and y1, and plot both pairs on twin y-axes.
# NOTE: use dedicated names instead of `x`/`y`, which would silently
# shadow the open h5py File handles created at the top of the notebook.
x_roll, _, y_roll, _ = setTrainAndTestSeries(x=df_PCx,y=df_y,
                                             w_train = rolling_w,
                                             offset_train=0,
                                             col_x='PCx1',col_y='y1')

fig, ax1 = plt.subplots(figsize=(9,6))

idx = x_roll.index

# raw series on the left axis
ax1.plot(idx, x_roll.values, 'r-', label='PCx1')
ax1.plot(idx, y_roll.values, 'b-', label='y1')
ax1.tick_params('y', colors='k')
plt.legend()
# rolling coefficients on the right axis
ax2 = ax1.twinx()
ax2.plot(idx, alpha, 'g-', label='alpha')
ax2.plot(idx, beta, 'm-', label='beta')
ax2.tick_params('y', colors='k')

plt.legend()
fig.tight_layout()

plt.show()

This calls for a model with $(\alpha, \beta)$:

  • time-dependent
  • with a known dynamics that can be forecasted

spoiler: filter $(\alpha_t, \beta_t)$ as the latent state of a Kalman filter...

Further evidence of the need for a model with richer dynamics is the fact that increasing the length of the window over which we train the model, $w_{train} = 250 \rightarrow w_{train} = 1000$, actually increases the $RMSE_{train}$, making the model even less reliable out-of-sample.

We test this over the best predictive model so far: $$ y_{5,t} = \alpha + \beta \cdot PCx_{4,t} + \epsilon $$ We keep $w_{test} = 250$.

In [552]:
# Measure how the RMSE of the y5 ~ PCx4 model evolves as the training
# window grows from 250 to 1000 observations (w_test stays at 250).
results = {'Linear regression': {'Train': {}, 'Test': {}}}

increasing_w_train = np.arange(250, 1010, 10)

# loop over increasingly long training windows
for w in increasing_w_train:

    # single target / single predictor, kept as loops for easy extension
    for target in ['y5']:

        for predictor in ['PCx4']:

            # train/test split with the current training-window length
            x_train, x_test, y_train, y_test = setTrainAndTestSeries(
                x=df_PCx, y=df_y, w_train=w, col_x=predictor, col_y=target)

            # fit the regression; only the two RMSE figures are retained
            linearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLinearModel(
                x_train, x_test, y_train, y_test, IO=False)

            results['Linear regression']['Train'][w] = RMSE_train
            results['Linear regression']['Test'][w] = RMSE_test

# collect the sweep into two Series indexed by the window length
trainResult_Wtrain = pd.Series(results['Linear regression']['Train'], index=increasing_w_train)
testResult_Wtrain = pd.Series(results['Linear regression']['Test'], index=increasing_w_train)
In [553]:
# In-sample RMSE as a function of the training-window length
trainResult_Wtrain.plot()
plt.title('RMSE In sample')
plt.xlabel('$w_{train}$')
plt.tight_layout()
plt.show()
In [554]:
# Out-of-sample RMSE as a function of the training-window length.
ax=testResult_Wtrain.plot()
plt.title('RMSE out sample')
# The x-axis is the training-window length (w_test is fixed at 250),
# so label it w_train — the original '$w_{test}$' label was wrong.
plt.xlabel('$w_{train}$')
plt.tight_layout()
plt.show()

From these plots, it is clear that between 300 and 400, the in-sample RMSE increases abruptly and, at 400, the out-of-sample RMSE increases a lot as well. Let's see some plots (250,300,350,400,450). The problem seems to be the number of jumps included/excluded in the window $[T_{train}-w_{train}:T_{train}]$.

In [339]:
# Inspect a handful of window lengths around the regime change seen in the
# sweep above, printing the fitted (alpha, beta) alongside both RMSEs.
increasing_w_train = [250,300,350,400,450]

# loop over increasing w_train windows
for w in increasing_w_train:

    # loop over the y to be predicted
    for y_i in ['y5']:

        # loop over the single PCs used as predictors in a linear regression model
        for PCx_j in ['PCx4']:

            # retrieve test series for training and prediction
            x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx,y=df_y,
                                                                     w_train=w,
                                                                     col_x=PCx_j,col_y=y_i)

            # run the linear regression and retrieve predicted y (both in- and out- of sample)
            linearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLinearModel(x_train, x_test, 
                                                                                                 y_train, y_test, 
                                                                                                 IO=False)
            
            # use .iloc for positional access — integer-key lookup on a
            # label-indexed pandas Series (params[0]) is deprecated and
            # removed in recent pandas versions
            print('w_train = {0:2d}, [RMSE_train = {1:.2f}; RMSE_test = {2:.2f}], alpha = {3:.2f}, beta = {4:.2f}'.format(w,
                                                                                                       RMSE_train, RMSE_test,
                                                                                                       linearModelFitted.params.iloc[0], 
                                                                                                       linearModelFitted.params.iloc[1]))

            # plot model predictions (in- and out- of sample, for comparison)
            plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred,' ($w_{train} ='+str(w)+'$)',plotTest=True)
w_train = 250, [RMSE_train = 0.11; RMSE_test = 0.16], alpha = -0.51, beta = -0.53
w_train = 300, [RMSE_train = 0.13; RMSE_test = 0.17], alpha = -0.50, beta = -0.53
w_train = 350, [RMSE_train = 0.24; RMSE_test = 0.18], alpha = -0.42, beta = -0.48
w_train = 400, [RMSE_train = 0.32; RMSE_test = 0.24], alpha = -0.33, beta = -0.47
w_train = 450, [RMSE_train = 0.34; RMSE_test = 0.32], alpha = -0.28, beta = -0.54

 Model 2: Multiple linear regression

Before reverting to time-dependent $(\alpha, \beta)$, I try to understand whether all the PCs together have better predictive power. Therefore I fit the following multiple linear regression model:

$$ y_t = \alpha + \sum_{j=1}^6 \beta_j \cdot PCx_{j,t} + \epsilon $$ performed for each $y_{i,t}$, $i=1,...,5$ over windows defined as before $[T_{train}-w_{train}:T_{train}]$ and tested over $[T_{train}:T_{train}+w_{test}]$ (default $w_{train}=w_{test}=250$).

In [555]:
def runMultipleLinearRegressionModel(x_train, x_test, y_train, y_test, IO=False):
    """Fit an OLS regression y ~ const + X and return fit, predictions and RMSEs.

    Parameters
    ----------
    x_train, x_test : pd.DataFrame
        Regressors over the training / test windows.
    y_train, y_test : pd.Series
        Target over the training / test windows.
    IO : bool
        If True, print the train/test RMSE and the statsmodels summary.

    Returns
    -------
    (multipleLinearModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test)
    """
    # Running the multiple linear regression with an explicit intercept column
    multipleLinearModelFit = regression.linear_model.OLS(y_train, sm.add_constant(x_train)).fit()

    # Use the fitted model's predict() instead of re-assembling the linear
    # combination from params[0] / params[1:] by hand: same numerical result,
    # but not fragile w.r.t. parameter ordering or column-name alignment.
    y_train_pred = multipleLinearModelFit.predict(sm.add_constant(x_train))
    y_train_pred.name = y_train.name + '_pred [trained]'

    y_test_pred = multipleLinearModelFit.predict(sm.add_constant(x_test))
    y_test_pred.name = y_test.name + '_pred [test]'

    # root-mean-squared errors, in- and out-of-sample
    RMSE_train = np.sqrt(np.mean((y_train_pred.values - y_train.values) ** 2))
    RMSE_test = np.sqrt(np.mean((y_test_pred.values - y_test.values) ** 2))

    if IO:
        print('RMSE in-sample = {0:.2f}'.format(RMSE_train))
        print('RMSE out-of-sample = {0:.2f}'.format(RMSE_test))
        print(multipleLinearModelFit.summary())

    return multipleLinearModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
In [557]:
# collect in- and out-of-sample RMSEs of the multiple linear regression, per target
results = {}
results['Multiple Linear Regression'] = {'Train': {}, 'Test': {}}

# loop over the y to be predicted
for y_i in df_y.columns:

    # retrieve train/test series (all PCs as regressors) for this target
    x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx, y=df_y, col_y=y_i)

    # run the multiple linear regression and retrieve predicted y (both in- and out-of-sample)
    multipleLinearModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runMultipleLinearRegressionModel(
        x_train, x_test, y_train, y_test, IO=True)

    # plot model predictions (in- and out-of-sample, for comparison)
    plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred, plot_x=False)

    # note: the previous per-target `= {}` initializations were redundant —
    # they were immediately overwritten by the scalar RMSEs below
    results['Multiple Linear Regression']['Train'][y_i] = RMSE_train
    results['Multiple Linear Regression']['Test'][y_i] = RMSE_test

trainResultMultipleLinearModel = pd.Series(results['Multiple Linear Regression']['Train'])
testResultMultipleLinearModel = pd.Series(results['Multiple Linear Regression']['Test'])
trainResultMultipleLinearModel.name = 'MLR'
testResultMultipleLinearModel.name = 'MLR'
RMSE in-sample = 0.07
RMSE out-of-sample = 0.29
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y1   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.982
Method:                 Least Squares   F-statistic:                     2240.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.49e-209
Time:                        21:25:31   Log-Likelihood:                 309.78
No. Observations:                 250   AIC:                            -605.6
Df Residuals:                     243   BIC:                            -580.9
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0906      0.510     -0.178      0.859      -1.095       0.913
PCx1           0.4844      0.457      1.061      0.290      -0.415       1.384
PCx2          -0.0227      0.559     -0.041      0.968      -1.123       1.078
PCx3          -0.0584      0.722     -0.081      0.936      -1.480       1.364
PCx4          -0.3224      0.045     -7.202      0.000      -0.411      -0.234
PCx5          -0.1671      0.340     -0.491      0.624      -0.838       0.504
PCx6          -0.3171      1.362     -0.233      0.816      -3.001       2.367
==============================================================================
Omnibus:                        9.812   Durbin-Watson:                   0.044
Prob(Omnibus):                  0.007   Jarque-Bera (JB):               10.150
Skew:                          -0.469   Prob(JB):                      0.00625
Kurtosis:                       2.693   Cond. No.                         689.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.08
RMSE out-of-sample = 0.56
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y2   R-squared:                       0.984
Model:                            OLS   Adj. R-squared:                  0.984
Method:                 Least Squares   F-statistic:                     2510.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.84e-215
Time:                        21:25:31   Log-Likelihood:                 288.77
No. Observations:                 250   AIC:                            -563.5
Df Residuals:                     243   BIC:                            -538.9
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.9394      0.554     12.518      0.000       5.847       8.031
PCx1          -5.3989      0.497    -10.869      0.000      -6.377      -4.420
PCx2          -7.4554      0.608    -12.266      0.000      -8.653      -6.258
PCx3           9.5760      0.785     12.195      0.000       8.029      11.123
PCx4          -0.1513      0.049     -3.107      0.002      -0.247      -0.055
PCx5          -4.7102      0.370    -12.720      0.000      -5.440      -3.981
PCx6         -17.5809      1.482    -11.864      0.000     -20.500     -14.662
==============================================================================
Omnibus:                        6.938   Durbin-Watson:                   0.167
Prob(Omnibus):                  0.031   Jarque-Bera (JB):                4.865
Skew:                           0.205   Prob(JB):                       0.0878
Kurtosis:                       2.454   Cond. No.                         689.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.08
RMSE out-of-sample = 0.42
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y3   R-squared:                       0.977
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     1707.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.63e-195
Time:                        21:25:32   Log-Likelihood:                 275.77
No. Observations:                 250   AIC:                            -537.5
Df Residuals:                     243   BIC:                            -512.9
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.3041      0.584     -2.233      0.026      -2.454      -0.154
PCx1           2.7494      0.523      5.255      0.000       1.719       3.780
PCx2           2.4523      0.640      3.830      0.000       1.191       3.714
PCx3          -3.3699      0.827     -4.074      0.000      -4.999      -1.741
PCx4          -0.3068      0.051     -5.982      0.000      -0.408      -0.206
PCx5           1.2711      0.390      3.259      0.001       0.503       2.039
PCx6           7.2635      1.561      4.653      0.000       4.189      10.338
==============================================================================
Omnibus:                       13.780   Durbin-Watson:                   0.094
Prob(Omnibus):                  0.001   Jarque-Bera (JB):               12.763
Skew:                          -0.491   Prob(JB):                      0.00169
Kurtosis:                       2.491   Cond. No.                         689.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.04
RMSE out-of-sample = 0.38
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y4   R-squared:                       0.991
Model:                            OLS   Adj. R-squared:                  0.990
Method:                 Least Squares   F-statistic:                     4312.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          1.17e-243
Time:                        21:25:33   Log-Likelihood:                 423.71
No. Observations:                 250   AIC:                            -833.4
Df Residuals:                     243   BIC:                            -808.8
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.1436      0.323      3.539      0.000       0.507       1.780
PCx1          -0.4082      0.290     -1.410      0.160      -0.978       0.162
PCx2          -1.3147      0.354     -3.711      0.000      -2.013      -0.617
PCx3           1.7254      0.458      3.770      0.000       0.824       2.627
PCx4          -0.2091      0.028     -7.369      0.000      -0.265      -0.153
PCx5          -0.6874      0.216     -3.185      0.002      -1.113      -0.262
PCx6          -3.1902      0.864     -3.693      0.000      -4.892      -1.489
==============================================================================
Omnibus:                        1.323   Durbin-Watson:                   0.093
Prob(Omnibus):                  0.516   Jarque-Bera (JB):                1.013
Skew:                          -0.097   Prob(JB):                        0.603
Kurtosis:                       3.244   Cond. No.                         689.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
RMSE in-sample = 0.04
RMSE out-of-sample = 0.09
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                     y5   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.995
Method:                 Least Squares   F-statistic:                     7519.
Date:                Mon, 27 Aug 2018   Prob (F-statistic):          8.82e-273
Time:                        21:25:33   Log-Likelihood:                 474.10
No. Observations:                 250   AIC:                            -934.2
Df Residuals:                     243   BIC:                            -909.5
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.0468      0.264     -3.963      0.000      -1.567      -0.526
PCx1           0.6987      0.237      2.952      0.003       0.233       1.165
PCx2           0.4969      0.290      1.716      0.087      -0.074       1.067
PCx3          -0.7059      0.374     -1.887      0.060      -1.443       0.031
PCx4          -0.3714      0.023    -16.008      0.000      -0.417      -0.326
PCx5          -0.0359      0.176     -0.204      0.839      -0.383       0.312
PCx6           0.4962      0.706      0.703      0.483      -0.895       1.887
==============================================================================
Omnibus:                       19.974   Durbin-Watson:                   0.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               20.184
Skew:                           0.647   Prob(JB):                     4.14e-05
Kurtosis:                       2.485   Cond. No.                         689.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

As expected, despite the in-sample improvement, the MLR model doesn't outperform the single linear model in terms of predictive power. Moreover, most of the MLR betas are not statistically significant, with the exception of the one associated with the best one-factor linear model.

In [558]:
# DataFrame.append was deprecated and removed in pandas 2.0; use pd.concat,
# adding the named Series as a single row (row label = Series name, 'MLR')
trainResults = pd.concat([trainResultLinearModel, trainResultMultipleLinearModel.to_frame().T])
testResults = pd.concat([testResultLinearModel, testResultMultipleLinearModel.to_frame().T])
In [559]:
ax = trainResults.plot()
# bug fix: was range(len(...)+1), which produced one more tick than labels
# (an error on modern matplotlib); one tick per model row, as in the test plot
ax.set_xticks(range(len(trainResults.index.values)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.tight_layout()
plt.show()

trainResults
Out[559]:
y1 y2 y3 y4 y5
LR_PCx1 0.152051 0.189995 0.184832 0.114813 0.141190
LR_PCx2 0.133582 0.216253 0.212217 0.128466 0.104705
LR_PCx3 0.299190 0.408789 0.377306 0.273323 0.268133
LR_PCx4 0.136021 0.189945 0.177009 0.122126 0.114034
LR_PCx5 0.496102 0.535281 0.454352 0.432800 0.470991
LR_PCx6 0.477755 0.581159 0.515503 0.419247 0.446309
MLR 0.070084 0.076230 0.080298 0.044433 0.036322
In [560]:
# out-of-sample RMSE per model (x-axis) and target (one line per y)
ax = testResults.plot()
tick_positions = range(len(testResults.index.values))
ax.set_xticks(tick_positions)
ax.set_xticklabels(testResults.index.values)
plt.title('RMSE Out of sample')
plt.tight_layout()
plt.show()

# display the table below the figure
testResults
Out[560]:
y1 y2 y3 y4 y5
LR_PCx1 0.437942 0.367888 0.302783 0.398135 0.257347
LR_PCx2 0.425370 0.338904 0.625755 0.732607 0.426990
LR_PCx3 0.661684 0.540318 0.608685 0.778618 0.519273
LR_PCx4 0.287331 0.286109 0.328546 0.430902 0.161760
LR_PCx5 0.532020 0.675560 0.977715 0.848494 0.566461
LR_PCx6 0.915018 0.716783 0.475248 0.685394 0.653452
MLR 0.289490 0.562417 0.423087 0.375912 0.087440

Model 3: Lasso linear regression

Before introducing the Kalman filter approach, let's forget for a moment the PCs time series and get back to the original Xs. As a model selection approach, the LASSO automatically selects the relevant beta. The loss minimized is ($N=70000$ in our sample):

$$ \frac{1}{2N} || y - \sum_{k=1}^{56} \beta_k x_{k} ||^2_2 + \lambda \sum_{k=1}^{56} |\beta_k| $$ performed over windows defined as before $[T_{train}-w_{train}:T_{train}]$ and tested over $[T_{train}:T_{train}+w_{test}]$ (default $w_{train}=w_{test}=250$).

I decided to use the LassoLarsCV version of the Lasso algorithm which performs automatically a cross-validation to infer the optimal weight $\lambda$.

In [561]:
def runLassoRegressionModel(x_train, x_test, y_train, y_test, IO=False):
    """Fit a LARS-Lasso model with cross-validated regularization and predict.

    Parameters
    ----------
    x_train, x_test : pd.DataFrame
        Regressors over the training / test windows.
    y_train, y_test : pd.Series
        Target over the training / test windows.
    IO : bool
        If True, print the train/test RMSE.

    Returns
    -------
    (lassoModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test)
    """
    # LassoLarsCV selects the regularization weight by 10-fold cross-validation
    lassoModelFit = LassoLarsCV(cv=10).fit(x_train, y_train)

    # in-sample predictions, wrapped as a named Series aligned with y_train
    y_train_pred = pd.Series(data=lassoModelFit.predict(x_train), index=y_train.index)
    y_train_pred.name = y_train.name + '_pred [trained]'

    # out-of-sample predictions, aligned with y_test
    y_test_pred = pd.Series(data=lassoModelFit.predict(x_test), index=y_test.index)
    y_test_pred.name = y_test.name + '_pred [test]'

    # root-mean-squared errors
    residuals_train = y_train_pred.values - y_train.values
    residuals_test = y_test_pred.values - y_test.values
    RMSE_train = np.sqrt(np.mean(residuals_train ** 2))
    RMSE_test = np.sqrt(np.mean(residuals_test ** 2))

    if IO:
        print('RMSE in-sample = {0:.2f}'.format(RMSE_train))
        print('RMSE out-of-sample = {0:.2f}'.format(RMSE_test))

    return lassoModelFit, y_train_pred, y_test_pred, RMSE_train, RMSE_test
In [562]:
# silence LARS/convergence warnings for cleaner output
# (NOTE: this suppresses ALL warnings globally from this point on)
warnings.filterwarnings("ignore")

# collect in- and out-of-sample RMSEs of the Lasso regression, per target
results = {}
results['Lasso Regression'] = {'Train': {}, 'Test': {}}

# loop over the y to be predicted
for y_i in df_y.columns:

    # retrieve train/test series (the original 56 Xs as regressors) for this target
    x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_x, y=df_y, col_y=y_i)

    # run the Lasso regression and retrieve predicted y (both in- and out-of-sample)
    lassoModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test = runLassoRegressionModel(
        x_train, x_test, y_train, y_test, IO=True)

    # plot model predictions (in- and out-of-sample, for comparison)
    plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred, plot_x=False)

    # note: the previous per-target `= {}` initializations were redundant —
    # they were immediately overwritten by the scalar RMSEs below
    results['Lasso Regression']['Train'][y_i] = RMSE_train
    results['Lasso Regression']['Test'][y_i] = RMSE_test

trainResultLassoModel = pd.Series(results['Lasso Regression']['Train'])
testResultLassoModel = pd.Series(results['Lasso Regression']['Test'])
trainResultLassoModel.name = 'Lasso'
testResultLassoModel.name = 'Lasso'
RMSE in-sample = 0.07
RMSE out-of-sample = 0.25
RMSE in-sample = 0.24
RMSE out-of-sample = 0.19
RMSE in-sample = 0.31
RMSE out-of-sample = 0.43
RMSE in-sample = 0.04
RMSE out-of-sample = 0.36
RMSE in-sample = 0.04
RMSE out-of-sample = 0.10

Results of the Lasso algorithm are not substantially different from those obtained with the MLR or even with the best single PCx model for the given y.

In [563]:
# DataFrame.append was deprecated and removed in pandas 2.0; use pd.concat,
# adding the named Series as a single row (row label = Series name, 'Lasso')
trainResults = pd.concat([trainResults, trainResultLassoModel.to_frame().T])
testResults = pd.concat([testResults, testResultLassoModel.to_frame().T])
In [564]:
ax = trainResults.plot()
# bug fix: was range(len(...)+1) — one more tick than labels
# (an error on modern matplotlib); one tick per model row
ax.set_xticks(range(len(trainResults.index.values)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.tight_layout()
plt.show()

trainResults
Out[564]:
y1 y2 y3 y4 y5
LR_PCx1 0.152051 0.189995 0.184832 0.114813 0.141190
LR_PCx2 0.133582 0.216253 0.212217 0.128466 0.104705
LR_PCx3 0.299190 0.408789 0.377306 0.273323 0.268133
LR_PCx4 0.136021 0.189945 0.177009 0.122126 0.114034
LR_PCx5 0.496102 0.535281 0.454352 0.432800 0.470991
LR_PCx6 0.477755 0.581159 0.515503 0.419247 0.446309
MLR 0.070084 0.076230 0.080298 0.044433 0.036322
Lasso 0.068102 0.235692 0.310884 0.040744 0.036117
In [565]:
# out-of-sample RMSE per model (x-axis) and target (one line per y)
ax = testResults.plot()
tick_positions = range(len(testResults.index.values))
ax.set_xticks(tick_positions)
ax.set_xticklabels(testResults.index.values)
plt.title('RMSE Out of sample')
plt.tight_layout()
plt.show()

# display the table below the figure
testResults
Out[565]:
y1 y2 y3 y4 y5
LR_PCx1 0.437942 0.367888 0.302783 0.398135 0.257347
LR_PCx2 0.425370 0.338904 0.625755 0.732607 0.426990
LR_PCx3 0.661684 0.540318 0.608685 0.778618 0.519273
LR_PCx4 0.287331 0.286109 0.328546 0.430902 0.161760
LR_PCx5 0.532020 0.675560 0.977715 0.848494 0.566461
LR_PCx6 0.915018 0.716783 0.475248 0.685394 0.653452
MLR 0.289490 0.562417 0.423087 0.375912 0.087440
Lasso 0.254450 0.191252 0.429687 0.361414 0.102481

The LASSO algorithm finds it optimal to include 6 of the Xs as regressors.

Here is the path of the $\beta_k$, when varying the weight $\lambda$ (here, plotted up to the optimal $\lambda \simeq 8 \times 10^{-4}$ )

In [513]:
print('lambda={0:.4f}'.format(lassoModelFitted.alpha_))
lambda=0.0008
In [502]:
# Lasso coefficient paths as a function of -log10(lambda)
m_log_lambdas = -np.log10(lassoModelFitted.alphas_)

fig = plt.figure(figsize=(10, 7))
ax = plt.gca()
plt.plot(m_log_lambdas, lassoModelFitted.coef_path_.T)
# raw strings for the LaTeX labels: '\l' in a plain string is an invalid
# escape sequence (SyntaxWarning on modern Python)
plt.axvline(-np.log10(lassoModelFitted.alpha_), linestyle='--', color='k',
            label=r'$\lambda$ 10-fold cross-validated')
plt.ylabel('Regression Coefficients')
plt.xlabel(r'$-\log(\lambda)$')
plt.title('Regression Coefficients Progression for Lasso Paths')
plt.legend()
plt.show()

Here I show the coefficients $\beta_k$ assigned to each $x_k$ $k=1, \dots, 56$ and below, the corresponding $x_k$ in the training set.

In [504]:
# bar chart of the Lasso coefficients beta_k, one bar per original regressor x_k
fig = plt.figure(figsize=(15, 8))
# pass `kind` explicitly rather than positionally
pd.Series(data=lassoModelFitted.coef_, index=df_x.columns).plot(kind='bar', color='r')
# zero reference line (replaces plotting a constant-zero Series)
plt.axhline(0.0, color='k')
plt.show()
In [505]:
beta_k = pd.Series(data=lassoModelFitted.coef_, index=df_x.columns)
# bug fix: the Lasso-selected regressors are those with NONZERO coefficients;
# `beta_k > 0` silently dropped regressors with negative betas
x_k = x_train.loc[:, beta_k[beta_k != 0].index]
# figure height scales with the number of selected regressors (was hard-coded to 6)
x_k.plot(subplots=True, figsize=(7, 5 * len(x_k.columns)), legend=False, title=list(x_k.columns), color='r')
plt.show()

 Model 4: Kalman filter

We compute time-varying estimates of $(\alpha_t, \beta_t)$ with a Kalman filter: the state vector $(\beta_t, \alpha_t)$ follows a random walk (identity transition matrix), and the observation equation is $y_t = \alpha_t + \beta_t \cdot PCx_t + \epsilon_t$.

Model 4.1: Kalman filter with initial guesses and prediction based on last state vector values

In [694]:
def runKalmanFilterModel(x_train, x_test, y_train, y_test, IO=False, optimized=False, online_prediction=False):
    """Fit y_t = alpha_t + beta_t * x_t with a Kalman filter and predict the test window.

    The hidden state is (beta_t, alpha_t) and follows a random walk (identity
    transition matrix); the observation matrix at time t is [x_t, 1], so the
    observation equation is y_t = beta_t * x_t + alpha_t + noise.

    Parameters
    ----------
    x_train, x_test : pd.Series
        Single predictor over the training / test windows.
    y_train, y_test : pd.Series
        Target over the training / test windows.
    IO : bool
        If True, print the train/test RMSE.
    optimized : bool
        If True, estimate the covariances and initial state with a single EM
        iteration instead of the fixed guesses in the else-branch below.
    online_prediction : bool
        If True, keep updating the filtered state through the test window,
        feeding the filter a lagged moving average of y as a noisy proxy for
        the unobserved y_t; otherwise freeze (beta, alpha) at the last
        training-window estimate.

    Returns
    -------
    (kf, y_train_pred, y_test_pred, RMSE_train, RMSE_test, alpha, beta)
        `alpha` and `beta` are the filtered state paths over the training window.
    """

    # define the observation matrix when (beta, alpha) is the state vector:
    # row t is [x_t, 1], shaped (T, 1, 2) as pykalman expects
    obs_mat = np.expand_dims(np.vstack([[x_train], [np.ones(len(x_train))]]).T, axis=1)

    if optimized:
        kf = KalmanFilter(n_dim_obs=1, n_dim_state=2, # y is 1-dimensional, (alpha, beta) is 2-dimensional
                          transition_matrices=np.eye(2),
                          observation_matrices=obs_mat)
        
        # run a single EM iteration to optimize the covariances and initial guesses
        # (n_iter is kept to 1 to limit, as much as possible, overfitting of the training set)
        kf.em(y_train.values, n_iter=1, em_vars=['transition_covariance',
                                                  'observation_covariance',
                                                  'initial_state_mean', 
                                                  'initial_state_covariance'])
    else:
        
        # fixed guesses: small state noise relative to observation noise
        delta = 1e-3
        trans_cov = delta / (1 - delta) * np.eye(2) # How much random walk wiggles
        
        kf = KalmanFilter(n_dim_obs=1, n_dim_state=2, # y is 1-dimensional, (alpha, beta) is 2-dimensional
                          initial_state_mean=[0,0],
                          initial_state_covariance=np.ones((2, 2)),
                          transition_matrices=np.eye(2),
                          observation_matrices=obs_mat,
                          observation_covariance=1.0,
                          transition_covariance=trans_cov)    
    
    # Use the observations y to get running estimates and errors for the state parameters
    state_means, state_covs = kf.filter(y_train.values)

    # filtered (beta, alpha) state: column 0 is beta, column 1 is alpha
    # (matches the [x_t, 1] ordering of the observation matrix)
    alpha = state_means[:,1]
    beta = state_means[:,0]
    
    # in-sample prediction with the time-varying filtered coefficients
    y_train_pred = alpha + beta*x_train
    y_train_pred.name = y_train.name + '_pred [trained]'
    
    if online_prediction:
        
        y_test_pred = pd.Series(index=y_test.index)
        
        # initialize the state to be updated to the last filtered in training
        estimated_state_mean, estimated_state_covs = state_means[-1], state_covs[-1]
        
        # loop over the test set — NOTE: only the x values are genuinely observed
        # here; y is replaced by a lagged-moving-average guess
        # NOTE(review): assumes the test index is integer-like (it is compared
        # against `lag` and sliced with t-lag:t) — confirm against the caller
        for t in x_test.index:
            
            # we make a NOISY guess y_{t} | info up to t-1, given:
            # the estimate (beta, alpha)_{t-1}
            # x_t
            lag=5
            if t < lag:
                if t == 0:
                    y_t_guess = y_train.tail(lag).values.mean()
                else:
                    pass # y_t_guess not updated in the first lag steps
            else:
                y_t_guess = y_test[t-lag:t].mean() # Note well: y_t is not included in the averaging
            #y_t_guess = estimated_state_mean[1] + estimated_state_mean[0]*x_test[t] # exact KF forecast, tautological
            #y_t_guess = estimated_state_mean[1] + estimated_state_mean[0]*x_test[t] + np.random.randn() # with same observation noise

            # Compute the estimate: (beta, alpha)_t of the state for time t, given:
            # the estimate (beta, alpha)_{t-1}
            # the guess: y_{t} | info up to t-1
            # x_t
            estimated_state_mean, estimated_state_covs = kf.filter_update(filtered_state_mean=estimated_state_mean, 
                                                                          filtered_state_covariance=estimated_state_covs,
                                                                          observation=y_t_guess,
                                                                          observation_matrix=np.array([[x_test[t],1.0]]))
            
            # given x_t and the estimate (beta, alpha)_t just computed, 
            # make the prediction for y_t
            y_test_pred[t] = estimated_state_mean[1] + estimated_state_mean[0]*x_test[t]
            
                                                      
    else:
    
        # for out-of-sample prediction use the last filtered value of the (beta, alpha) state
        y_test_pred = alpha[-1] + beta[-1]*x_test
    
    y_test_pred.name = y_test.name + '_pred [test]'
    
    # root-mean-squared errors, in- and out-of-sample
    RMSE_train = np.sqrt(np.mean((y_train_pred.values - y_train.values) ** 2))
    RMSE_test = np.sqrt(np.mean((y_test_pred.values - y_test.values) ** 2))
 
    if IO:
        print('RMSE in-sample = {0:.2f}'.format(RMSE_train))
        print('RMSE out-of-sample = {0:.2f}'.format(RMSE_test))

    return kf, y_train_pred, y_test_pred, RMSE_train, RMSE_test, alpha, beta
In [567]:
# in- and out-of-sample RMSEs of the single-PC Kalman-filter models
results = {'Kalman Filter': {'Train': {}, 'Test': {}}}

# loop over the y to be predicted
for y_i in df_y.columns:

    results['Kalman Filter']['Train'][y_i] = {}
    results['Kalman Filter']['Test'][y_i] = {}

    # loop over the single PCs used as predictors
    for PCx_j in df_PCx.columns:

        # retrieve train/test series for this (PC, target) pair
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(x=df_PCx, y=df_y,
                                                                 col_x=PCx_j, col_y=y_i)

        # run the Kalman filter and retrieve predicted y (both in- and out-of-sample)
        KFModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test, _, _ = runKalmanFilterModel(
            x_train, x_test, y_train, y_test, IO=True)

        # plot model predictions (in- and out-of-sample, for comparison)
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)

        label = 'KF_' + PCx_j
        results['Kalman Filter']['Train'][y_i][label] = RMSE_train
        results['Kalman Filter']['Test'][y_i][label] = RMSE_test

trainResultKFModel = pd.DataFrame(results['Kalman Filter']['Train'])
testResultKFModel = pd.DataFrame(results['Kalman Filter']['Test'])
trainResultKFModel.name = 'KF'
testResultKFModel.name = 'KF'
RMSE in-sample = 0.29
RMSE out-of-sample = 0.33
RMSE in-sample = 0.18
RMSE out-of-sample = 0.45
RMSE in-sample = 0.19
RMSE out-of-sample = 0.58
RMSE in-sample = 0.12
RMSE out-of-sample = 0.23
RMSE in-sample = 0.27
RMSE out-of-sample = 0.50
RMSE in-sample = 0.24
RMSE out-of-sample = 0.46
RMSE in-sample = 0.27
RMSE out-of-sample = 0.27
RMSE in-sample = 0.21
RMSE out-of-sample = 0.52
RMSE in-sample = 0.22
RMSE out-of-sample = 0.57
RMSE in-sample = 0.15
RMSE out-of-sample = 0.26
RMSE in-sample = 0.29
RMSE out-of-sample = 0.59
RMSE in-sample = 0.27
RMSE out-of-sample = 0.42
RMSE in-sample = 0.20
RMSE out-of-sample = 0.52
RMSE in-sample = 0.19
RMSE out-of-sample = 0.76
RMSE in-sample = 0.19
RMSE out-of-sample = 0.77
RMSE in-sample = 0.13
RMSE out-of-sample = 0.38
RMSE in-sample = 0.25
RMSE out-of-sample = 0.79
RMSE in-sample = 0.24
RMSE out-of-sample = 0.67
RMSE in-sample = 0.29
RMSE out-of-sample = 0.54
RMSE in-sample = 0.15
RMSE out-of-sample = 0.77
RMSE in-sample = 0.18
RMSE out-of-sample = 0.85
RMSE in-sample = 0.10
RMSE out-of-sample = 0.47
RMSE in-sample = 0.25
RMSE out-of-sample = 0.84
RMSE in-sample = 0.22
RMSE out-of-sample = 0.68
RMSE in-sample = 0.36
RMSE out-of-sample = 0.30
RMSE in-sample = 0.14
RMSE out-of-sample = 0.47
RMSE in-sample = 0.19
RMSE out-of-sample = 0.56
RMSE in-sample = 0.09
RMSE out-of-sample = 0.15
RMSE in-sample = 0.26
RMSE out-of-sample = 0.58
RMSE in-sample = 0.22
RMSE out-of-sample = 0.38
In [569]:
# DataFrame.append was deprecated and removed in pandas 2.0; use pd.concat
# to stack the KF result rows below the existing model rows
trainResults = pd.concat([trainResults, trainResultKFModel])
testResults = pd.concat([testResults, testResultKFModel])
In [585]:
ax = trainResults.plot(figsize=(10, 6))
# bug fix: was range(len(...)+1) — one more tick than labels
# (an error on modern matplotlib); one tick per model row
ax.set_xticks(range(len(trainResults.index.values)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.tight_layout()
plt.show()

trainResults
Out[585]:
y1 y2 y3 y4 y5
LR_PCx1 0.152051 0.189995 0.184832 0.114813 0.141190
LR_PCx2 0.133582 0.216253 0.212217 0.128466 0.104705
LR_PCx3 0.299190 0.408789 0.377306 0.273323 0.268133
LR_PCx4 0.136021 0.189945 0.177009 0.122126 0.114034
LR_PCx5 0.496102 0.535281 0.454352 0.432800 0.470991
LR_PCx6 0.477755 0.581159 0.515503 0.419247 0.446309
MLR 0.070084 0.076230 0.080298 0.044433 0.036322
Lasso 0.068102 0.235692 0.310884 0.040744 0.036117
KF_PCx1 0.289935 0.269008 0.196185 0.288686 0.361297
KF_PCx2 0.176399 0.205967 0.186587 0.149158 0.135770
KF_PCx3 0.189564 0.218618 0.189663 0.181603 0.191458
KF_PCx4 0.116774 0.148138 0.128808 0.102697 0.092158
KF_PCx5 0.273345 0.293798 0.245752 0.248808 0.263777
KF_PCx6 0.240985 0.273791 0.236919 0.217032 0.224081
In [584]:
# out-of-sample RMSE per model (x-axis) and target (one line per y)
ax = testResults.plot(figsize=(10, 6))
tick_positions = range(len(testResults.index.values))
ax.set_xticks(tick_positions)
ax.set_xticklabels(testResults.index.values)
plt.title('RMSE Out of sample')
plt.tight_layout()
plt.show()

# display the table below the figure
testResults
Out[584]:
y1 y2 y3 y4 y5
LR_PCx1 0.437942 0.367888 0.302783 0.398135 0.257347
LR_PCx2 0.425370 0.338904 0.625755 0.732607 0.426990
LR_PCx3 0.661684 0.540318 0.608685 0.778618 0.519273
LR_PCx4 0.287331 0.286109 0.328546 0.430902 0.161760
LR_PCx5 0.532020 0.675560 0.977715 0.848494 0.566461
LR_PCx6 0.915018 0.716783 0.475248 0.685394 0.653452
MLR 0.289490 0.562417 0.423087 0.375912 0.087440
Lasso 0.254450 0.191252 0.429687 0.361414 0.102481
KF_PCx1 0.328219 0.270949 0.520914 0.537374 0.303672
KF_PCx2 0.449613 0.515790 0.756666 0.768613 0.465263
KF_PCx3 0.582245 0.568921 0.765783 0.847418 0.559718
KF_PCx4 0.227008 0.261601 0.384042 0.465642 0.146436
KF_PCx5 0.498131 0.589429 0.793769 0.842017 0.578970
KF_PCx6 0.458926 0.419628 0.673435 0.675145 0.382738

Model 4.2: Kalman filter with optimized parameters and prediction based on last state vector values

DESCRIPTION

This model is clearly overfitting the training set, even though the number of iterations of the EM algorithm is just 1.

In [624]:
# Fit one Kalman-filter model with EM-optimized parameters per (target, PC)
# pair, and collect in- and out-of-sample RMSE for every combination.
model_key = 'Kalman Filter Optimized'
results = {model_key: {'Train': {}, 'Test': {}}}

# loop over the targets to be predicted
for target in df_y.columns:

    train_rmse = results[model_key]['Train'][target] = {}
    test_rmse = results[model_key]['Test'][target] = {}

    # loop over the single PCs used as predictors
    for pc in df_PCx.columns:

        # split the selected x/y series into train and test windows
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(
            x=df_PCx, y=df_y, col_x=pc, col_y=target)

        # fit the Kalman filter (EM-optimized) and retrieve predictions
        # and RMSEs, both in- and out-of-sample
        KFModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test, _, _ = \
            runKalmanFilterModel(x_train, x_test, y_train, y_test,
                                 IO=True, optimized=True)

        # visual comparison of in- and out-of-sample fits
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)

        train_rmse['KF_Opt_' + pc] = RMSE_train
        test_rmse['KF_Opt_' + pc] = RMSE_test

trainResultKF_Opt_Model = pd.DataFrame(results[model_key]['Train'])
testResultKF_Opt_Model = pd.DataFrame(results[model_key]['Test'])
trainResultKF_Opt_Model.name = 'KF_Opt'
testResultKF_Opt_Model.name = 'KF_Opt'
RMSE in-sample = 0.02
RMSE out-of-sample = 0.45
RMSE in-sample = 0.02
RMSE out-of-sample = 0.44
RMSE in-sample = 0.02
RMSE out-of-sample = 0.53
RMSE in-sample = 0.01
RMSE out-of-sample = 0.41
RMSE in-sample = 0.02
RMSE out-of-sample = 0.46
RMSE in-sample = 0.02
RMSE out-of-sample = 0.50
RMSE in-sample = 0.02
RMSE out-of-sample = 0.53
RMSE in-sample = 0.02
RMSE out-of-sample = 0.57
RMSE in-sample = 0.02
RMSE out-of-sample = 0.62
RMSE in-sample = 0.01
RMSE out-of-sample = 0.56
RMSE in-sample = 0.02
RMSE out-of-sample = 0.57
RMSE in-sample = 0.02
RMSE out-of-sample = 0.61
RMSE in-sample = 0.02
RMSE out-of-sample = 0.61
RMSE in-sample = 0.02
RMSE out-of-sample = 0.61
RMSE in-sample = 0.02
RMSE out-of-sample = 0.68
RMSE in-sample = 0.01
RMSE out-of-sample = 0.73
RMSE in-sample = 0.01
RMSE out-of-sample = 0.59
RMSE in-sample = 0.02
RMSE out-of-sample = 0.69
RMSE in-sample = 0.02
RMSE out-of-sample = 0.77
RMSE in-sample = 0.01
RMSE out-of-sample = 0.75
RMSE in-sample = 0.02
RMSE out-of-sample = 0.83
RMSE in-sample = 0.01
RMSE out-of-sample = 0.76
RMSE in-sample = 0.02
RMSE out-of-sample = 0.74
RMSE in-sample = 0.03
RMSE out-of-sample = 0.82
RMSE in-sample = 0.02
RMSE out-of-sample = 0.54
RMSE in-sample = 0.01
RMSE out-of-sample = 0.48
RMSE in-sample = 0.02
RMSE out-of-sample = 0.56
RMSE in-sample = 0.01
RMSE out-of-sample = 0.45
RMSE in-sample = 0.02
RMSE out-of-sample = 0.46
RMSE in-sample = 0.03
RMSE out-of-sample = 0.58
In [625]:
# pandas removed DataFrame.append in 2.0; pd.concat is the supported equivalent.
# NOTE(review): this cell is not idempotent — re-running it stacks the same
# rows again (the duplicated KF_Opt_* rows in later outputs show this happened).
trainResults = pd.concat([trainResults, trainResultKF_Opt_Model])
testResults = pd.concat([testResults, testResultKF_Opt_Model])
In [626]:
# Report the best in-sample model per target and plot all RMSEs.
print('Best model in-sample by y:')
print(trainResults.idxmin())

ax = trainResults.plot(figsize=(15, 6))
# Bug fix: tick positions must match the number of labels. The original used
# len(...)+1, creating one more tick than labels, which modern matplotlib
# rejects with a ValueError.
ax.set_xticks(range(len(trainResults.index.values)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.xticks(rotation=-90)
plt.tight_layout()
plt.show()

trainResults
Best model in-sample by y:
y1    KF_Opt_PCx4
y2    KF_Opt_PCx4
y3    KF_Opt_PCx4
y4    KF_Opt_PCx4
y5    KF_Opt_PCx4
dtype: object
Out[626]:
y1 y2 y3 y4 y5
LR_PCx1 0.152051 0.189995 0.184832 0.114813 0.141190
LR_PCx2 0.133582 0.216253 0.212217 0.128466 0.104705
LR_PCx3 0.299190 0.408789 0.377306 0.273323 0.268133
LR_PCx4 0.136021 0.189945 0.177009 0.122126 0.114034
LR_PCx5 0.496102 0.535281 0.454352 0.432800 0.470991
LR_PCx6 0.477755 0.581159 0.515503 0.419247 0.446309
MLR 0.070084 0.076230 0.080298 0.044433 0.036322
Lasso 0.068102 0.235692 0.310884 0.040744 0.036117
KF_PCx1 0.289935 0.269008 0.196185 0.288686 0.361297
KF_PCx2 0.176399 0.205967 0.186587 0.149158 0.135770
KF_PCx3 0.189564 0.218618 0.189663 0.181603 0.191458
KF_PCx4 0.116774 0.148138 0.128808 0.102697 0.092158
KF_PCx5 0.273345 0.293798 0.245752 0.248808 0.263777
KF_PCx6 0.240985 0.273791 0.236919 0.217032 0.224081
KF_PCx1 0.289935 0.269008 0.196185 0.288686 0.361297
KF_PCx2 0.176399 0.205967 0.186587 0.149158 0.135770
KF_PCx3 0.189564 0.218618 0.189663 0.181603 0.191458
KF_PCx4 0.116774 0.148138 0.128808 0.102697 0.092158
KF_PCx5 0.273345 0.293798 0.245752 0.248808 0.263777
KF_PCx6 0.240985 0.273791 0.236919 0.217032 0.224081
KF_Opt_PCx1 0.013994 0.014169 0.014960 0.013739 0.013833
KF_Opt_PCx2 0.013172 0.013233 0.014014 0.012511 0.012373
KF_Opt_PCx3 0.016192 0.016520 0.016785 0.016090 0.016133
KF_Opt_PCx4 0.004975 0.005591 0.005969 0.004485 0.004692
KF_Opt_PCx5 0.010207 0.010687 0.011197 0.009575 0.009490
KF_Opt_PCx6 0.020737 0.021098 0.021358 0.020692 0.020559
KF_Opt_PCx1 0.017049 0.017275 0.017081 0.017379 0.018397
KF_Opt_PCx2 0.015452 0.015479 0.016078 0.014911 0.014992
KF_Opt_PCx3 0.020361 0.020821 0.018037 0.021970 0.024178
KF_Opt_PCx4 0.008887 0.009588 0.007087 0.010046 0.011892
KF_Opt_PCx5 0.016011 0.016644 0.013882 0.017216 0.019389
KF_Opt_PCx6 0.024365 0.024845 0.023025 0.025492 0.027030
In [627]:
# Report the best out-of-sample model per target and plot all RMSEs.
print('Best model out-of-sample by y:')
print(testResults.idxmin())

model_labels = testResults.index.values
ax = testResults.plot(figsize=(15, 6))
ax.set_xticks(range(len(model_labels)))
ax.set_xticklabels(model_labels)
plt.title('RMSE Out of sample')
plt.xticks(rotation=-90)
plt.tight_layout()
plt.show()

testResults
Best model out-of-sample by y:
y1    KF_PCx4
y2      Lasso
y3    LR_PCx1
y4      Lasso
y5        MLR
dtype: object
Out[627]:
y1 y2 y3 y4 y5
LR_PCx1 0.437942 0.367888 0.302783 0.398135 0.257347
LR_PCx2 0.425370 0.338904 0.625755 0.732607 0.426990
LR_PCx3 0.661684 0.540318 0.608685 0.778618 0.519273
LR_PCx4 0.287331 0.286109 0.328546 0.430902 0.161760
LR_PCx5 0.532020 0.675560 0.977715 0.848494 0.566461
LR_PCx6 0.915018 0.716783 0.475248 0.685394 0.653452
MLR 0.289490 0.562417 0.423087 0.375912 0.087440
Lasso 0.254450 0.191252 0.429687 0.361414 0.102481
KF_PCx1 0.328219 0.270949 0.520914 0.537374 0.303672
KF_PCx2 0.449613 0.515790 0.756666 0.768613 0.465263
KF_PCx3 0.582245 0.568921 0.765783 0.847418 0.559718
KF_PCx4 0.227008 0.261601 0.384042 0.465642 0.146436
KF_PCx5 0.498131 0.589429 0.793769 0.842017 0.578970
KF_PCx6 0.458926 0.419628 0.673435 0.675145 0.382738
KF_PCx1 0.328219 0.270949 0.520914 0.537374 0.303672
KF_PCx2 0.449613 0.515790 0.756666 0.768613 0.465263
KF_PCx3 0.582245 0.568921 0.765783 0.847418 0.559718
KF_PCx4 0.227008 0.261601 0.384042 0.465642 0.146436
KF_PCx5 0.498131 0.589429 0.793769 0.842017 0.578970
KF_PCx6 0.458926 0.419628 0.673435 0.675145 0.382738
KF_Opt_PCx1 0.464509 0.535009 0.639628 0.799349 0.557460
KF_Opt_PCx2 0.433673 0.553638 0.599344 0.743909 0.474283
KF_Opt_PCx3 0.552535 0.624186 0.690978 0.840714 0.571509
KF_Opt_PCx4 0.422673 0.578359 0.770133 0.779683 0.464409
KF_Opt_PCx5 0.482902 0.659332 0.634247 0.783160 0.521293
KF_Opt_PCx6 0.493137 0.603221 0.673571 0.815979 0.579638
KF_Opt_PCx1 0.450076 0.534665 0.606763 0.766517 0.537255
KF_Opt_PCx2 0.440140 0.574102 0.607294 0.750569 0.483752
KF_Opt_PCx3 0.529478 0.621788 0.683448 0.828328 0.564524
KF_Opt_PCx4 0.408223 0.557448 0.734340 0.763457 0.448832
KF_Opt_PCx5 0.455686 0.570737 0.587280 0.744570 0.460816
KF_Opt_PCx6 0.496379 0.610864 0.692022 0.822042 0.578890

Model 4.3: Kalman filter with online prediction

Uses x(t) to predict y(t), i.e. the filter's prediction for each step incorporates the contemporaneous observation.

In [695]:
# Fit one online-prediction Kalman-filter model per (target, PC) pair — the
# filter state is updated with each observation — and collect in- and
# out-of-sample RMSE for every combination.
model_key = 'Kalman Filter Online'
results = {model_key: {'Train': {}, 'Test': {}}}

# loop over the targets to be predicted
for target in df_y.columns:

    train_rmse = results[model_key]['Train'][target] = {}
    test_rmse = results[model_key]['Test'][target] = {}

    # loop over the single PCs used as predictors
    for pc in df_PCx.columns:

        # split the selected x/y series into train and test windows
        # (x_train/x_test/y_train/y_test intentionally leak into later cells)
        x_train, x_test, y_train, y_test = setTrainAndTestSeries(
            x=df_PCx, y=df_y, col_x=pc, col_y=target)

        # fit the Kalman filter with online prediction and retrieve
        # predictions and RMSEs, both in- and out-of-sample
        KFModelFitted, y_train_pred, y_test_pred, RMSE_train, RMSE_test, _, _ = \
            runKalmanFilterModel(x_train, x_test, y_train, y_test,
                                 IO=True, online_prediction=True)

        # visual comparison of in- and out-of-sample fits
        plotModel(x_train, x_test, y_train, y_test, y_train_pred, y_test_pred)

        train_rmse['KF_Online_' + pc] = RMSE_train
        test_rmse['KF_Online_' + pc] = RMSE_test

trainResultKF_Online_Model = pd.DataFrame(results[model_key]['Train'])
testResultKF_Online_Model = pd.DataFrame(results[model_key]['Test'])
trainResultKF_Online_Model.name = 'KF_Online'
testResultKF_Online_Model.name = 'KF_Online'
RMSE in-sample = 0.29
RMSE out-of-sample = 0.26
RMSE in-sample = 0.18
RMSE out-of-sample = 0.27
RMSE in-sample = 0.19
RMSE out-of-sample = 0.31
RMSE in-sample = 0.12
RMSE out-of-sample = 0.18
RMSE in-sample = 0.27
RMSE out-of-sample = 0.33
RMSE in-sample = 0.24
RMSE out-of-sample = 0.34
RMSE in-sample = 0.27
RMSE out-of-sample = 0.24
RMSE in-sample = 0.21
RMSE out-of-sample = 0.25
RMSE in-sample = 0.22
RMSE out-of-sample = 0.31
RMSE in-sample = 0.15
RMSE out-of-sample = 0.20
RMSE in-sample = 0.29
RMSE out-of-sample = 0.34
RMSE in-sample = 0.27
RMSE out-of-sample = 0.34
RMSE in-sample = 0.20
RMSE out-of-sample = 0.27
RMSE in-sample = 0.19
RMSE out-of-sample = 0.26
RMSE in-sample = 0.19
RMSE out-of-sample = 0.32
RMSE in-sample = 0.13
RMSE out-of-sample = 0.19
RMSE in-sample = 0.25
RMSE out-of-sample = 0.34
RMSE in-sample = 0.24
RMSE out-of-sample = 0.36
RMSE in-sample = 0.29
RMSE out-of-sample = 0.27
RMSE in-sample = 0.15
RMSE out-of-sample = 0.35
RMSE in-sample = 0.18
RMSE out-of-sample = 0.38
RMSE in-sample = 0.10
RMSE out-of-sample = 0.30
RMSE in-sample = 0.25
RMSE out-of-sample = 0.44
RMSE in-sample = 0.22
RMSE out-of-sample = 0.43
RMSE in-sample = 0.36
RMSE out-of-sample = 0.25
RMSE in-sample = 0.14
RMSE out-of-sample = 0.23
RMSE in-sample = 0.19
RMSE out-of-sample = 0.30
RMSE in-sample = 0.09
RMSE out-of-sample = 0.16
RMSE in-sample = 0.26
RMSE out-of-sample = 0.32
RMSE in-sample = 0.22
RMSE out-of-sample = 0.32
In [684]:
# pandas removed DataFrame.append in 2.0; pd.concat is the supported equivalent.
# NOTE(review): this cell is not idempotent — re-running it stacks the same
# rows again (the duplicated KF_Online_* rows in later outputs show this).
trainResults = pd.concat([trainResults, trainResultKF_Online_Model])
testResults = pd.concat([testResults, testResultKF_Online_Model])
In [685]:
# Report the best in-sample model per target and plot all RMSEs.
print('Best model in-sample by y:')
print(trainResults.idxmin())

ax = trainResults.plot(figsize=(15, 6))
# Bug fix: tick positions must match the number of labels. The original used
# len(...)+1, creating one more tick than labels, which modern matplotlib
# rejects with a ValueError.
ax.set_xticks(range(len(trainResults.index.values)))
ax.set_xticklabels(trainResults.index.values)
plt.title('RMSE In sample')
plt.xticks(rotation=-90)
plt.tight_layout()
plt.show()

trainResults
Best model in-sample by y:
y1    KF_Opt_PCx4
y2    KF_Opt_PCx4
y3    KF_Opt_PCx4
y4    KF_Opt_PCx4
y5    KF_Opt_PCx4
dtype: object
Out[685]:
y1 y2 y3 y4 y5
LR_PCx1 0.152051 0.189995 0.184832 0.114813 0.141190
LR_PCx2 0.133582 0.216253 0.212217 0.128466 0.104705
LR_PCx3 0.299190 0.408789 0.377306 0.273323 0.268133
LR_PCx4 0.136021 0.189945 0.177009 0.122126 0.114034
LR_PCx5 0.496102 0.535281 0.454352 0.432800 0.470991
LR_PCx6 0.477755 0.581159 0.515503 0.419247 0.446309
MLR 0.070084 0.076230 0.080298 0.044433 0.036322
Lasso 0.068102 0.235692 0.310884 0.040744 0.036117
KF_PCx1 0.289935 0.269008 0.196185 0.288686 0.361297
KF_PCx2 0.176399 0.205967 0.186587 0.149158 0.135770
KF_PCx3 0.189564 0.218618 0.189663 0.181603 0.191458
KF_PCx4 0.116774 0.148138 0.128808 0.102697 0.092158
KF_PCx5 0.273345 0.293798 0.245752 0.248808 0.263777
KF_PCx6 0.240985 0.273791 0.236919 0.217032 0.224081
KF_PCx1 0.289935 0.269008 0.196185 0.288686 0.361297
KF_PCx2 0.176399 0.205967 0.186587 0.149158 0.135770
KF_PCx3 0.189564 0.218618 0.189663 0.181603 0.191458
KF_PCx4 0.116774 0.148138 0.128808 0.102697 0.092158
KF_PCx5 0.273345 0.293798 0.245752 0.248808 0.263777
KF_PCx6 0.240985 0.273791 0.236919 0.217032 0.224081
KF_Opt_PCx1 0.013994 0.014169 0.014960 0.013739 0.013833
KF_Opt_PCx2 0.013172 0.013233 0.014014 0.012511 0.012373
KF_Opt_PCx3 0.016192 0.016520 0.016785 0.016090 0.016133
KF_Opt_PCx4 0.004975 0.005591 0.005969 0.004485 0.004692
KF_Opt_PCx5 0.010207 0.010687 0.011197 0.009575 0.009490
KF_Opt_PCx6 0.020737 0.021098 0.021358 0.020692 0.020559
KF_Opt_PCx1 0.017049 0.017275 0.017081 0.017379 0.018397
KF_Opt_PCx2 0.015452 0.015479 0.016078 0.014911 0.014992
KF_Opt_PCx3 0.020361 0.020821 0.018037 0.021970 0.024178
KF_Opt_PCx4 0.008887 0.009588 0.007087 0.010046 0.011892
KF_Opt_PCx5 0.016011 0.016644 0.013882 0.017216 0.019389
KF_Opt_PCx6 0.024365 0.024845 0.023025 0.025492 0.027030
KF_Online_PCx1 0.289935 0.269008 0.196185 0.288686 0.361297
KF_Online_PCx2 0.176399 0.205967 0.186587 0.149158 0.135770
KF_Online_PCx3 0.189564 0.218618 0.189663 0.181603 0.191458
KF_Online_PCx4 0.116774 0.148138 0.128808 0.102697 0.092158
KF_Online_PCx5 0.273345 0.293798 0.245752 0.248808 0.263777
KF_Online_PCx6 0.240985 0.273791 0.236919 0.217032 0.224081
KF_Online_PCx1 0.289935 0.269008 0.196185 0.288686 0.361297
KF_Online_PCx2 0.176399 0.205967 0.186587 0.149158 0.135770
KF_Online_PCx3 0.189564 0.218618 0.189663 0.181603 0.191458
KF_Online_PCx4 0.116774 0.148138 0.128808 0.102697 0.092158
KF_Online_PCx5 0.273345 0.293798 0.245752 0.248808 0.263777
KF_Online_PCx6 0.240985 0.273791 0.236919 0.217032 0.224081
In [686]:
# Report the best out-of-sample model per target and plot all RMSEs.
print('Best model out-of-sample by y:')
print(testResults.idxmin())

model_labels = testResults.index.values
ax = testResults.plot(figsize=(15, 6))
ax.set_xticks(range(len(model_labels)))
ax.set_xticklabels(model_labels)
plt.title('RMSE Out of sample')
plt.xticks(rotation=-90)
plt.tight_layout()
plt.show()

testResults
Best model out-of-sample by y:
y1           KF_PCx4
y2             Lasso
y3    KF_Online_PCx4
y4             Lasso
y5               MLR
dtype: object
Out[686]:
y1 y2 y3 y4 y5
LR_PCx1 0.437942 0.367888 0.302783 0.398135 0.257347
LR_PCx2 0.425370 0.338904 0.625755 0.732607 0.426990
LR_PCx3 0.661684 0.540318 0.608685 0.778618 0.519273
LR_PCx4 0.287331 0.286109 0.328546 0.430902 0.161760
LR_PCx5 0.532020 0.675560 0.977715 0.848494 0.566461
LR_PCx6 0.915018 0.716783 0.475248 0.685394 0.653452
MLR 0.289490 0.562417 0.423087 0.375912 0.087440
Lasso 0.254450 0.191252 0.429687 0.361414 0.102481
KF_PCx1 0.328219 0.270949 0.520914 0.537374 0.303672
KF_PCx2 0.449613 0.515790 0.756666 0.768613 0.465263
KF_PCx3 0.582245 0.568921 0.765783 0.847418 0.559718
KF_PCx4 0.227008 0.261601 0.384042 0.465642 0.146436
KF_PCx5 0.498131 0.589429 0.793769 0.842017 0.578970
KF_PCx6 0.458926 0.419628 0.673435 0.675145 0.382738
KF_PCx1 0.328219 0.270949 0.520914 0.537374 0.303672
KF_PCx2 0.449613 0.515790 0.756666 0.768613 0.465263
KF_PCx3 0.582245 0.568921 0.765783 0.847418 0.559718
KF_PCx4 0.227008 0.261601 0.384042 0.465642 0.146436
KF_PCx5 0.498131 0.589429 0.793769 0.842017 0.578970
KF_PCx6 0.458926 0.419628 0.673435 0.675145 0.382738
KF_Opt_PCx1 0.464509 0.535009 0.639628 0.799349 0.557460
KF_Opt_PCx2 0.433673 0.553638 0.599344 0.743909 0.474283
KF_Opt_PCx3 0.552535 0.624186 0.690978 0.840714 0.571509
KF_Opt_PCx4 0.422673 0.578359 0.770133 0.779683 0.464409
KF_Opt_PCx5 0.482902 0.659332 0.634247 0.783160 0.521293
KF_Opt_PCx6 0.493137 0.603221 0.673571 0.815979 0.579638
KF_Opt_PCx1 0.450076 0.534665 0.606763 0.766517 0.537255
KF_Opt_PCx2 0.440140 0.574102 0.607294 0.750569 0.483752
KF_Opt_PCx3 0.529478 0.621788 0.683448 0.828328 0.564524
KF_Opt_PCx4 0.408223 0.557448 0.734340 0.763457 0.448832
KF_Opt_PCx5 0.455686 0.570737 0.587280 0.744570 0.460816
KF_Opt_PCx6 0.496379 0.610864 0.692022 0.822042 0.578890
KF_Online_PCx1 0.328219 0.270949 0.520914 0.537374 0.303672
KF_Online_PCx2 0.449613 0.515790 0.756666 0.768613 0.465263
KF_Online_PCx3 0.582245 0.568921 0.765783 0.847418 0.559718
KF_Online_PCx4 0.227008 0.261601 0.384042 0.465642 0.146436
KF_Online_PCx5 0.498131 0.589429 0.793769 0.842017 0.578970
KF_Online_PCx6 0.458926 0.419628 0.673435 0.675145 0.382738
KF_Online_PCx1 0.447164 0.490879 0.472013 0.559697 0.332471
KF_Online_PCx2 0.448488 0.527868 0.807952 0.876193 0.396673
KF_Online_PCx3 0.772723 0.829353 0.872155 0.756805 0.837411
KF_Online_PCx4 0.400731 0.529434 0.270028 0.377151 0.780011
KF_Online_PCx5 0.662572 0.513115 0.886252 1.069733 0.435036
KF_Online_PCx6 0.485880 0.707878 0.646885 0.857839 0.341195
In [687]:
# Actual target value at position 10 of the test window, for comparison with
# the naive-average baselines computed in the next cells.
# NOTE(review): y_test[10] is label-based for a pandas Series — equivalent to
# position 10 only if the index is the default RangeIndex; confirm.
y_test[10]
Out[687]:
0.014084506779909134
In [690]:
# Naive baseline: trailing mean of the test series up to position 10.
# NOTE(review): the slice 10-5:11 covers 6 points (positions 5..10 inclusive),
# not 5 — confirm whether the endpoint was meant to include position 10.
y_test[10-5:11].mean()
Out[690]:
0.01056338008493185
In [693]:
# Naive persistence baseline: mean of the last 5 training observations.
y_train.iloc[-5:].values.mean()
Out[693]:
0.02394366227090359